# <p style="text-align: center;">RNA Knowledge Graph Build Data Preparation</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@unimi.it), [TJCallahan](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=callahantiff@gmail.com)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/), [PheKnowLator](https://github.com/callahantiff/PheKnowLator/)  
<!--- **Release:** **[v2.0.0](https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0)** --->
  
<br>  
  
**Purpose:** This notebook serves as a script to download, process, map, and clean data in order to build edges for the RNA-centered Knowledge Graph.

<br>

**Assumptions:**   
- Edge data downloads ➞ `../resources/edge_data`  
- Ontologies ➞ `../resources/ontologies`    
- Processed data write location ➞ `../resources/processed_data`  

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts.  
- **Data**: All downloaded and generated data sources are provided through the dedicated Zenodo repository [10.5281/zenodo.10078877](https://doi.org/10.5281/zenodo.10078877). <u>This notebook will download everything that is needed for you</u>.  
_____
***

## Table of Contents
***

### [Download Ontologies](#create-ontologies)


### [Create Identifier Maps](#create-identifier-maps)   


### [Download and process Edge Datasets](#create-edges)  

____
***

## Set-Up Environment
_____

In [None]:
%%capture
import sys
!{sys.executable} -m pip install -r requirements.txt
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import gffpandas.gffpandas as gffpd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from Bio import SeqIO, Entrez

from Bio.SeqIO.FastaIO import SimpleFastaParser

from typing import Tuple

#### Define Global Variables

In [None]:
# directory to store resources
resource_data_location = '../resources/'    

# directory to use for unprocessed data
unprocessed_data_location = '../resources/processed_data/unprocessed_data/'

# directory to use for processed data
processed_data_location = '../resources/processed_data/'

# directory to write relations data to
relations_data_location = '../resources/relations_data/'

# directory to write ontology data to
ontology_data_location = '../resources/ontologies/'

# directory to write edges data to
edge_data_location = '../resources/edge_data/'

# processed data url 
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'

# original data url 
original_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/original_data/'

# owltools location
owltools_location = '../pkt_kg/libs/owltools'

***
***
## DOWNLOAD AND PROCESS ONTOLOGIES  <a class="anchor" id="create-ontologies"></a>
***
***

In [None]:
# Ontologies fetched from the OBO PURL server. Entries that contain a path
# (e.g. 'go/extensions/go-plus') end up on disk under their last path component.
onto_list = ['ro', 'chebi', 'pr', 'mondo', 'go/extensions/go-plus', 'pw', 'so', 'hp/hp-international', 'uberon', 'vo', 'clo']

for ontology in onto_list:
    data_downloader('http://purl.obolibrary.org/obo/' + ontology + '.owl', ontology_data_location)

# For compatibility with the PheKnowLator ecosystem, we rename the ontology files to match the naming convention.
# Files that already carry the suffix are skipped so that re-running this cell does not
# produce '*_with_imports_with_imports.owl' copies.
for filename in os.listdir(ontology_data_location):
    if filename.endswith(".owl") and not filename.endswith("_with_imports.owl"):
        new_filename = filename.replace(".owl", "_with_imports.owl")
        # os.replace overwrites an existing target on every platform (os.rename fails on Windows)
        os.replace(os.path.join(ontology_data_location, filename), os.path.join(ontology_data_location, new_filename))

# PRO is already edited by PheKnowLator in order to contain only terms specifying human proteins
data_downloader(processed_url + 'pr_with_imports.owl', ontology_data_location)

# Shorten the file names of ontologies that were downloaded under a different name
# than the one used in the rest of the notebook (go, hp, ext).
for downloaded_name, expected_name in [('go-plus_with_imports.owl', 'go_with_imports.owl'),
                                       ('hp-international_with_imports.owl', 'hp_with_imports.owl'),
                                       ('uberon_with_imports.owl', 'ext_with_imports.owl')]:
    os.replace(os.path.join(ontology_data_location, downloaded_name),
               os.path.join(ontology_data_location, expected_name))

### Identify Relations and Inverse Relations

In [None]:
# Parse the RO ontology file and derive two look-up files from it:
# INVERSE_RELATIONS.txt (relation / inverse relation) and RELATIONS_LABELS.txt.
ro_graph = Graph().parse(ontology_data_location + 'ro_with_imports.owl')

with open(relations_data_location + 'INVERSE_RELATIONS.txt', 'w') as outfile:
    outfile.write('Relation' + '\t' + 'Inverse_Relation' + '\n')
    for s, p, o in tqdm(ro_graph):
        if 'owl#inverseOf' in str(p):
            # plain substring test: any subject/object whose URI contains 'RO' qualifies
            if 'RO' in str(s) and 'RO' in str(o):
                # keep only the last URI path segment and write the pair in both directions
                outfile.write(str(s.split('/')[-1]) + '\t' + str(o.split('/')[-1]) + '\n')
                outfile.write(str(o.split('/')[-1]) + '\t' + str(s.split('/')[-1]) + '\n')

ro_data = pd.read_csv(relations_data_location + 'INVERSE_RELATIONS.txt', header=0, delimiter='\t')

print('There are {edge_count} RO Relations and Inverse Relations'.format(edge_count=len(ro_data)))
print(ro_data.head(n=5))

# lower-cased label -> subject URI, for subjects containing '/RO_' and predicates containing 'label';
# because the dictionary is keyed by label, a label shared by several relations keeps only one of them
results = {str(x[2]).lower(): str(x[0]) for x in ro_graph if '/RO_' in str(x[0]) and 'label' in str(x[1]).lower()}

with open(relations_data_location + 'RELATIONS_LABELS.txt', 'w') as outfile:
    # NOTE(review): the header announces 'Label' then 'Relation', but each row below writes the
    # relation identifier first and the label second — confirm which order consumers expect.
    outfile.write('Label' + '\t' + 'Relation' + '\n')
    for k, v in results.items():
        outfile.write(str(v).split('/')[-1] + '\t' + str(k) + '\n')

ro_data_label = pd.read_csv(relations_data_location + 'RELATIONS_LABELS.txt', header=0, delimiter='\t')

print('There are {edge_count} RO Relations and Labels'.format(edge_count=len(ro_data_label)))
print(ro_data_label.head(n=5))

At this point, please run the [<tt>Ontology_Cleaning.ipynb</tt>](https://github.com/callahantiff/PheKnowLator/blob/master/notebooks/Ontology_Cleaning.ipynb) notebook provided by PKT.

***
***
## DOWNLOAD AND CREATE MAPPING DATASETS  <a class="anchor" id="create-identifier-maps"></a>
***
***

### Mappings provided by the PheKnowLator ecosystem

In [None]:
# Edge files published with the PheKnowLator build.
pkt_edge_files = ['ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt',
                  'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt',
                  'HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt',
                  'UNIPROT_PROTEIN_CATALYST.txt',
                  'UNIPROT_PROTEIN_COFACTOR.txt',
                  'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt',
                  'CLINVAR_VARIANT_GENE_DISEASE_PHENOTYPE_EDGES.txt']

# Identifier maps published with the PheKnowLator build (names already present above are not repeated).
pkt_map_files = ['DISEASE_MONDO_MAP.txt',
                 'ENSEMBL_GENE_ENTREZ_GENE_MAP.txt',
                 'GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt',
                 'MESH_CHEBI_MAP.txt',
                 'STRING_PRO_ONTOLOGY_MAP.txt',
                 'UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt',
                 'REACTOME_PW_GO_MAPPINGS.txt']

# Every file is listed exactly once, so nothing is requested twice.
for processed_file in pkt_edge_files + pkt_map_files:
    data_downloader(processed_url + processed_file, processed_data_location)

***
### New mappings

***
### Chemical labels+synonyms from ChEBI - ChEBI mapping


**Purpose:** To map Chemical labels+synonyms from ChEBI to ChEBI identifiers.

**Output:** `DESC_CHEBI_MAP.txt` + `SYN_CHEBI_MAP.txt`

In [None]:
# Get label (and exact-match) look-ups for all ontology classes
def gets_ontology_class_label(graph: Graph) -> Tuple:
    """Collect label and exact-match look-ups for the URI subjects of an ontology graph.

    Args:
        graph: An rdflib Graph; it is iterated as (subject, predicate, object) triples.

    Returns:
        A tuple of two dictionaries keyed by the lower-cased object string:
          - key -> list of subject URI strings, for triples whose predicate contains 'label'
            or 'exactmatch' (case-insensitive) and whose subject is a URIRef;
          - key -> 'DbXref' for label triples, 'ExactMatch' for exact-match triples.
        When the same key occurs in both groups, the exact-match entry wins.
    """
    label_uris: Dict = dict()
    match_uris: Dict = dict()

    for subj, pred, obj in graph:
        # only URI subjects are kept; the exact-match branch previously tested `[0]`
        # instead of the subject, which silently discarded every exact match
        if not isinstance(subj, URIRef):
            continue
        predicate, key = str(pred).lower(), str(obj).lower()
        if 'label' in predicate:
            label_uris.setdefault(key, []).append(str(subj))
        if 'exactmatch' in predicate:
            match_uris.setdefault(key, []).append(str(subj))

    # the 'DbXref' tag is kept for label entries so the returned values stay unchanged for callers
    label_type = dict.fromkeys(label_uris, 'DbXref')
    match_type = dict.fromkeys(match_uris, 'ExactMatch')

    return {**label_uris, **match_uris}, {**label_type, **match_type}

In [None]:
# Get synonym (and exact-match) look-ups for all ontology classes
def gets_ontology_class_synonym(graph: Graph) -> Tuple:
    """Collect synonym and exact-match look-ups for the URI subjects of an ontology graph.

    Args:
        graph: An rdflib Graph; it is iterated as (subject, predicate, object) triples.

    Returns:
        A tuple of two dictionaries keyed by the lower-cased object string:
          - key -> list of subject URI strings, for triples whose predicate contains 'synonym'
            or 'exactmatch' (case-insensitive) and whose subject is a URIRef;
          - key -> 'DbXref' for synonym triples, 'ExactMatch' for exact-match triples.
        When the same key occurs in both groups, the exact-match entry wins.
    """
    synonym_uris: Dict = dict()
    match_uris: Dict = dict()

    for subj, pred, obj in graph:
        # only URI subjects are kept; the exact-match branch previously tested `[0]`
        # instead of the subject, which silently discarded every exact match
        if not isinstance(subj, URIRef):
            continue
        predicate, key = str(pred).lower(), str(obj).lower()
        if 'synonym' in predicate:
            synonym_uris.setdefault(key, []).append(str(subj))
        if 'exactmatch' in predicate:
            match_uris.setdefault(key, []).append(str(subj))

    # the 'DbXref' tag is kept for synonym entries so the returned values stay unchanged for callers
    synonym_type = dict.fromkeys(synonym_uris, 'DbXref')
    match_type = dict.fromkeys(match_uris, 'ExactMatch')

    return {**synonym_uris, **match_uris}, {**synonym_type, **match_type}

In [None]:
# Get label+synonym look-up table for an ontology
def gets_ontology_lookup(ontology_name, with_import=True) :
    """Build, save and return the label+synonym -> identifier look-up table of an ontology.

    The ontology is parsed from ``ontology_data_location``. Labels and synonyms are each written
    to an intermediate tab-separated file in ``unprocessed_data_location``
    (``DESC_<NAME>_MAP.txt`` and ``SYN_<NAME>_MAP.txt``), read back, and merged; the merged,
    de-duplicated table is written to ``processed_data_location`` as ``DESC_<NAME>_MAP.txt``.

    Args:
        ontology_name: File stem of the ontology (e.g. 'chebi'); upper-cased for output names.
        with_import: True reads '<name>_with_imports.owl' (integrated ontologies);
            False reads '<name>.owl' (ontologies only used to standardize edge metadata).

    Returns:
        A DataFrame with column 0 = lower-cased label or synonym and column 1 = identifier
        (last path segment of the class URI), one row per (text, identifier) pair.
    """
    suffix = '_with_imports.owl' if with_import else '.owl'
    graph = Graph().parse(ontology_data_location + ontology_name + suffix)

    def _dump_and_reload(prefix, uri_lookup):
        """Write one look-up as TSV to the unprocessed folder and reload it, one row per identifier."""
        path = unprocessed_data_location + prefix + ontology_name.upper() + '_MAP.txt'
        # explicit UTF-8: pandas reads UTF-8 by default, whereas open() would use the locale encoding
        with open(path, 'w', encoding='utf-8') as outfile:
            for text, uris in uri_lookup.items():
                # sorted + join gives a reproducible 'id1, id2' cell without relying on str(set)
                identifiers = sorted({str(uri).split('/')[-1] for uri in uris})
                outfile.write(str(text) + '\t' + ', '.join(identifiers) + '\n')
        if not uri_lookup:
            # nothing was written; read_csv would fail on the empty file
            return pd.DataFrame(columns=[0, 1])
        table = pd.read_csv(path, header=None, delimiter='\t', encoding='utf-8')
        table[1] = table[1].str.split(', ')
        return table.explode(1)

    desc_map = _dump_and_reload('DESC_', gets_ontology_class_label(graph)[0])
    syn_map = _dump_and_reload('SYN_', gets_ontology_class_synonym(graph)[0])

    desc_map = pd.concat([desc_map, syn_map], ignore_index=True).drop_duplicates()
    desc_map.to_csv(processed_data_location + 'DESC_' + ontology_name.upper() + '_MAP.txt',
                    header=None, sep='\t', index=None)
    return desc_map

In [None]:
desc_chebi_map = gets_ontology_lookup('chebi')
desc_chebi_map

In [None]:
# If the chunks above have already been run, you can skip them and run only the following line to speed up construction:
desc_chebi_map = pd.read_csv(processed_data_location + 'DESC_CHEBI_MAP.txt', header=None, sep='\t')

***
### GO terms' label+synonym from GO - GO mapping


**Purpose:** To map GO terms' label+synonym from GO to GO identifiers.

**Output:** `DESC_GO_MAP.txt` + `SYN_GO_MAP.txt`

In [None]:
desc_go_map = gets_ontology_lookup('go')
desc_go_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_go_map = pd.read_csv(processed_data_location + 'DESC_GO_MAP.txt', header=None, sep='\t')

***
### Pathways labels from Reactome - Reactome mapping


**Purpose:** To map Reactome pathways labels from Reactome to Reactome identifiers.

**Output:** `DESC_REACTOME_MAP.txt`

In [None]:
data_downloader('https://raw.githubusercontent.com/ComPath/resources/master/mappings/kegg_reactome.csv',
                unprocessed_data_location, 'kegg_reactome.csv')

kegg_reactome_map = pd.read_csv(unprocessed_data_location + 'kegg_reactome.csv', header=0, delimiter=',')[['Source Name','Source ID']]
kegg_reactome_map.columns=[0,1]
kegg_reactome_map[0] = kegg_reactome_map[0].str.lower()
kegg_reactome_map

In [None]:
data_downloader('https://reactome.org/download/current/ReactomePathways.txt', unprocessed_data_location)

reactome_pathways = pd.read_csv(unprocessed_data_location + 'ReactomePathways.txt', header=None, delimiter='\t')
# remove all non-human pathways
reactome_pathways = reactome_pathways[reactome_pathways[2] == 'Homo sapiens'][[0,1]]
reactome_pathways.columns=[1,0]
reactome_pathways[0] = reactome_pathways[0].str.lower()
reactome_pathways

In [None]:
desc_reactome_map = pd.concat([kegg_reactome_map, reactome_pathways])
desc_reactome_map.to_csv(processed_data_location + "DESC_REACTOME_MAP.txt", header=False, sep="\t",index=False)
desc_reactome_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_reactome_map = pd.read_csv(processed_data_location + 'DESC_REACTOME_MAP.txt', header=None, sep='\t')

***
### Pathways labels from PW - PW mapping


**Purpose:** To map pathways labels from PW to PW identifiers.

**Output:** `DESC_PW_MAP.txt` + `SYN_PW_MAP.txt`

In [None]:
desc_pw_map = gets_ontology_lookup('pw')
desc_pw_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_pw_map = pd.read_csv(processed_data_location + 'DESC_PW_MAP.txt', header=None, sep='\t')

***
### miRNA - miRBase mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map miRNA and stem-loop miRNA to miRBase identifiers.

**Output:** `MIRNA_MIRBASE_MAP.txt`

Note: Provided by [miRBase](https://www.mirbase.org/).

In [None]:
data_downloader('https://www.mirbase.org/download/hsa.gff3', unprocessed_data_location)

miRBaseMap = gffpd.read_gff3(unprocessed_data_location + 'hsa.gff3')  
os.remove(unprocessed_data_location + 'hsa.gff3')
print(miRBaseMap.header)
print(miRBaseMap.df)

In [None]:
miRBaseMap = miRBaseMap.attributes_to_columns()
miRBaseMap = miRBaseMap[['attributes']]
miRBaseMap

In [None]:
# Split the ';'-separated GFF3 attribute string into positional columns (0, 1, 2, ...)
miRBaseMap = miRBaseMap.attributes.str.split(';',expand=True)
# Keep only the "Name" (position 2) and "ID" (position 0) columns, in that order
miRBaseMap = miRBaseMap[[2,0]]
# Remove the leading substring "ID=" (3 characters)
miRBaseMap[0] = miRBaseMap[0].str[3:]
# Remove the leading substring "Name=" (5 characters)
miRBaseMap[2] = miRBaseMap[2].str[5:]
# Written without header or index: first column is the name, second column is the identifier
miRBaseMap.to_csv(processed_data_location + 'MIRNA_MIRBASE_MAP.txt', header=None, sep='\t', index=None)
miRBaseMap

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
miRBaseMap = pd.read_csv(processed_data_location + 'MIRNA_MIRBASE_MAP.txt', header=None, names=[2,0], sep='\t')
# For back-compatibility issue
mirna_mirbase_map = miRBaseMap.copy()

***
### Disease labels+synonyms from Mondo - Mondo mapping


**Purpose:** To map Diseases labels+synonyms from Mondo to Mondo identifiers.

**Output:** `DESC_MONDO_MAP.txt` + `SYN_MONDO_MAP.txt`

In [None]:
desc_mondo_map = gets_ontology_lookup('mondo')
desc_mondo_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_mondo_map = pd.read_csv(processed_data_location + 'DESC_MONDO_MAP.txt', header=None, sep='\t')

***
### Phenotype labels+synonyms from HPO - HPO mapping


**Purpose:** To map Phenotype labels+synonyms from HPO to HPO identifiers.

**Output:** `DESC_HP_MAP.txt` + `SYN_HP_MAP.txt`

In [None]:
desc_hpo_map = gets_ontology_lookup('hp')
desc_hpo_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_hpo_map = pd.read_csv(processed_data_location + 'DESC_HP_MAP.txt', header=None, sep='\t')

We merge diseases and phenotypes since they are closely related. Moreover, "x-disease" and "x-phenotype" interactions share the same RO properties. 

In [None]:
desc_disPhe_map = pd.concat([desc_mondo_map, desc_hpo_map]).drop_duplicates()
desc_disPhe_map

***
### Disease Ontology (DO) - MONDO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map DO identifiers to MONDO identifiers.

**Output:** `DOID_MONDO_MAP.txt`

In [None]:
mondo_graph = Graph().parse(ontology_data_location + 'mondo_with_imports.owl')

mondo_dbxref = gets_ontology_class_dbxrefs(mondo_graph)[0]

# Fix DOIDs (substitute : with _)
mondo_dict = {str(k).replace(':','_').upper(): {str(i).split('/')[-1].replace(':','_') for i in v}
              for k, v in mondo_dbxref.items() if 'doid' in str(k)}
list({**mondo_dict}.items())[:5]

In [None]:
with open(processed_data_location + 'DOID_MONDO_MAP.txt', 'w') as outfile:
    for k, v in mondo_dict.items():
        outfile.write(str(k) + '\t' + str(v).replace('{','').replace('\'','').replace('}','') + '\n')

In [None]:
doid_mondo_map = pd.read_csv(processed_data_location+'DOID_MONDO_MAP.txt', header=None, delimiter='\t')
doid_mondo_map[1] = doid_mondo_map[1].str.split(', ')
doid_mondo_map = doid_mondo_map.explode(1)
doid_mondo_map

***
### Disease description from DO - DO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map Disease descriptions from DO to DO identifiers.

**Output:** None, this mapping will be used only internally.

Note: Provided by [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/).

In [None]:
data_downloader('http://watson.compbio.iupui.edu:8080/miR2Disease/download/diseaseList.txt', unprocessed_data_location)

In [None]:
desc_do_map = pd.read_csv(unprocessed_data_location + 'diseaseList.txt', sep="\t")
desc_do_map.columns = ['desc', 'doid']
desc_do_map['desc'] = desc_do_map['desc'].str.lower()
desc_do_map['doid'] = desc_do_map['doid'].str.upper().str.replace(':', '_')
desc_do_map

***
### TCGA - MONDO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To manually map the 32 TCGA cancer types to MONDO ontology.

**Output:** `TCGA_MONDO_MAP.txt`

In [None]:
cancer_mondo_map = pd.DataFrame(data=[['ACC','MONDO_0004971'],
                                 ['BLCA','MONDO_0004163'],
                                 ['BRCA','MONDO_0006256'],
                                 ['CESC','MONDO_0005131'],
                                 ['CHOL','MONDO_0019087'],
                                 ['COAD','MONDO_0002271'],
                                 ['DLBC','MONDO_0018905'],
                                 ['ESCA','MONDO_0019086'],
                                 ['GBM','MONDO_0018177'],
                                 ['HNSC','MONDO_0010150'],
                                 ['KICH','MONDO_0017885'],
                                 ['KIRC','MONDO_0005005'],
                                 ['KIRP','MONDO_0017884'],
                                 ['LGG','MONDO_0005499'],
                                 ['LIHC','MONDO_0007256'],
                                 ['LUAD','MONDO_0005061'],
                                 ['LUSC','MONDO_0005097'],
                                 ['MESO','MONDO_0005065'],
                                 ['OV','MONDO_0006046'],
                                 ['PAAD','MONDO_0006047'],
                                 ['PCPG','MONDO_0035540'],
                                 ['PRAD','MONDO_0005082'],
                                 ['READ','MONDO_0002169'],
                                 ['SARC','MONDO_0005089'],
                                 ['SKCM','MONDO_0005012'],
                                 ['STAD','MONDO_0005036'],
                                 ['TGCT','MONDO_0010108'],
                                 ['THCA','MONDO_0015075'],
                                 ['THYM','MONDO_0006456'],
                                 ['UCEC','MONDO_0000553'],
                                 ['UCS','MONDO_0006485'],
                                 ['UVM','MONDO_0006486']
                                 ])

cancer_mondo_map.to_csv(processed_data_location + 'TCGA_MONDO_MAP.txt', header=None, sep='\t', index=None)

***
### Amino Acid - ChEBI mapping 


**Purpose:** To manually map amino acids to the ChEBI ontology (SO could've been used too).

**Output:** `AminoAcid_ChEBI_MAP.txt`

In [None]:
# Manual three-letter amino-acid code -> ontology identifier table.
# 'Cys' previously repeated the aspartic acid identifier (CHEBI_22660); it now points to cysteine.
aa_chebi_map = pd.DataFrame(data=[['Leu','CHEBI_25017'],
                                 ['Phe','CHEBI_28044'],
                                 ['Ala','CHEBI_16449'],
                                 ['Asn','CHEBI_22653'],
                                 ['Glu','CHEBI_18237'],
                                 ['His','CHEBI_27570'],
                                 ['Asp','CHEBI_22660'],
                                 ['Cys','CHEBI_15356'],
                                 ['Gly','CHEBI_15428'],
                                 ['Ile','CHEBI_24898'],
                                 ['Lys','CHEBI_25094'],
                                 ['Met','CHEBI_16811'],
                                 ['Ser','CHEBI_17822'],
                                 ['Val','CHEBI_27266'],
                                 ['Gln','CHEBI_28300'],
                                 ['Arg','CHEBI_29016'],
                                 ['Pro','CHEBI_26271'],
                                 ['Thr','CHEBI_26986'],
                                 ['iMe','PR_000021937'],
                                 ['Trp','CHEBI_27897'],
                                 ['Tyr','CHEBI_18186']#,
                                 #['Sup','tRNA-Suppressor NOT GROUNDED']
                                 ])

# Written without header or index: column 0 = amino-acid code, column 1 = identifier
aa_chebi_map.to_csv(processed_data_location + 'AminoAcid_ChEBI_MAP.txt', header=None, sep='\t', index=None)

***
### Gene symbol - PRO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map gene symbols to PRO identifiers.

**Output:** `GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt`

In [None]:
symbol_ensembl_map = pd.read_csv(processed_data_location + 'GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt', sep="\t",
                                 header=None)
symbol_ensembl_map[[0,1]]

In [None]:
ensembl_pro_map = pd.read_csv(processed_data_location + 'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt', sep="\t",
                              header=None)
ensembl_pro_map[[1,0]]

In [None]:
symbol_to_pro = pd.merge(symbol_ensembl_map[[0,1]], ensembl_pro_map[[1,0]], left_on=[1], right_on=[0])
symbol_to_pro = symbol_to_pro[['0_x', '1_y']].drop_duplicates()
symbol_to_pro

In [None]:
symbol_to_pro.drop_duplicates().to_csv(processed_data_location+
                                       'GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt', header=None,
                                       sep='\t', index=None)

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
symbol_to_pro = pd.read_csv(processed_data_location+'GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt',names=['0_x','1_y'],sep='\t')

***
### Protein labels+synonyms from PRO - PRO mapping


**Purpose:** To map Protein labels+synonyms from PRO to PRO identifiers.

**Output:** `DESC_PR_MAP.txt` + `SYN_PR_MAP.txt`

Note: The employed PRO ontology is trimmed to contain only human proteins.

In [None]:
desc_pro_map = gets_ontology_lookup('pr')
desc_pro_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_pro_map = pd.read_csv(processed_data_location + 'DESC_PR_MAP.txt', header=None, sep='\t')

In [None]:
# Remove genes
desc_pro_map = desc_pro_map[~desc_pro_map[1].str.startswith('gene_symbol_report?hgnc_id=')]
desc_pro_map

In [None]:
# We decide to preferentially keep proteins such that human ones have been defined
pro_rows = desc_pro_map.dropna()
# .copy() so the clean-up below edits an independent frame
desc_pro_map_human = pro_rows[pro_rows[0].str.contains('human', case=False)].copy()
# Literal removals: regex=False is stated explicitly because the pandas default changed in 2.0
# and ' (' / ')' are not valid regular expressions on their own
for token in ("human ", "human", " (", ")"):
    desc_pro_map_human[0] = desc_pro_map_human[0].str.replace(token, '', regex=False)
# Drop everything from the first comma onwards; this one is a real pattern, so regex=True
desc_pro_map_human[0] = desc_pro_map_human[0].str.replace(r",(.*)", '', regex=True)
desc_pro_map_human[1] = desc_pro_map_human[1].str.split(', ')
desc_pro_map_human = desc_pro_map_human.explode(1)
desc_pro_map_human

In [None]:
# Apply the same label clean-up to the whole look-up table, working on an independent copy
desc_pro_map = desc_pro_map.copy()
# Literal removals: regex=False is stated explicitly because the pandas default changed in 2.0
# and ' (' / ')' are not valid regular expressions on their own
for token in ("human ", "human", " (", ")"):
    desc_pro_map[0] = desc_pro_map[0].str.replace(token, '', regex=False)
# Drop everything from the first comma onwards; this one is a real pattern, so regex=True
desc_pro_map[0] = desc_pro_map[0].str.replace(r",(.*)", '', regex=True)
desc_pro_map[1] = desc_pro_map[1].str.split(', ')
desc_pro_map = desc_pro_map.explode(1)
# Remove entries whose cleaned label also exists among the human-specific entries
desc_pro_map = desc_pro_map[~desc_pro_map[0].isin(desc_pro_map_human[0])]
desc_pro_map

In this way (i.e., using this modified look-up table), an entity x will be linked to "double-stranded RNA-activated factor 1 complex (human)" (PR_000027111) instead of "double-stranded RNA-activated factor 1 complex" (PR_000027110).

In [None]:
desc_pro_map = pd.concat([desc_pro_map, desc_pro_map_human]).drop_duplicates()

***
### NCI Thesaurus labels+synonyms from NCIT - NCIT mapping


**Purpose:** To map NCI Thesaurus labels+synonyms from NCIT to NCIT identifiers.

**Output:** `DESC_NCIT_MAP.txt` + `SYN_NCIT_MAP.txt`

Note: This is **not** an integrated ontology, but we use NCIT to standardize edge metadata as much as possible.

In [None]:
data_downloader('http://purl.obolibrary.org/obo/ncit.owl', ontology_data_location)

In [None]:
desc_ncit_map = gets_ontology_lookup('ncit', with_import=False)
desc_ncit_map

In [None]:
# If the chunks above have already been run, you can skip them and run only the following line to speed up construction:
desc_ncit_map = pd.read_csv(processed_data_location + 'DESC_NCIT_MAP.txt', header=None, sep='\t')

***
### Gene symbol - ENTREZ mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map gene symbols to ENTREZ identifiers.

**Output:** `GENE_SYMBOL_ENTREZ_ID_MAP.txt`

In [None]:
entrez_enst_map = pd.read_csv(processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', sep="\t", header=None)
entrez_enst_map

In [None]:
symbol_entrez_map = pd.merge(symbol_ensembl_map, entrez_enst_map, on=[1])
symbol_entrez_map = symbol_entrez_map[['0_x','0_y']].drop_duplicates()
symbol_entrez_map

In [None]:
symbol_entrez_map.to_csv(processed_data_location+'GENE_SYMBOL_ENTREZ_ID_MAP.txt',header=None, sep='\t', index=None)

In [None]:
# If the chunks above have already been run, you can skip them and run only the following line to speed up construction:
symbol_entrez_map = pd.read_csv(processed_data_location+'GENE_SYMBOL_ENTREZ_ID_MAP.txt',names=['0_x','0_y'],sep='\t')

***
### tsRNA - tRNA mapping 

**Purpose:** To map tsRNA to tRNA identifiers.

**Output:** `tRNA_tsRNA_MAP.txt`

Note: Provided by [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php).

In [None]:
# Save the file into the unprocessed data folder, which is where the next cell reads it from
data_downloader('https://rna.sysu.edu.cn/tsRFun/download/newID_20210202.txt', unprocessed_data_location)

In [None]:
tsRNA_tRF_map = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")
tsRNA_tRF_map = tsRNA_tRF_map[['tRNA','tsRNAid']]
tsRNA_tRF_map

In [None]:
tsRNA_tRF_map.to_csv(processed_data_location + 'tRNA_tsRNA_MAP.txt', header=None, sep='\t', index=None)

***
### ribozyme - RFAM mapping 

**Purpose:** To map ribozymes to RFAM identifiers.

**Output:** `ribozyme_RFAM_MAP.txt`

In [None]:
ribozyme_rfam_map = pd.DataFrame(data=[['LC ribozyme','family/RF00011'],
                                 ['hammerhead ribozyme','clan/CL00010'],
                                 ['glmS ribozyme','family/RF00234'],
                                 ['HDV-F-prausnitzii','family/RF02682'],
                                 ['HDV ribozyme','family/RF00094'],
                                 ['HDV_ribozyme','family/RF00094'],
                                 ['Hairpin','family/RF00173'],
                                 ['Hammerhead_1','clan/CL00010'],
                                 ['Hammerhead_HH9','clan/CL00010'],
                                 ['Hammerhead_3','clan/CL00010'],
                                 ['Hammerhead_HH10','clan/CL00010'],
                                 ['Hammerhead_II','clan/CL00010'],
                                 ['Pistol','family/RF02679'],
                                 ['Pistol ribozyme','family/RF02679'],
                                 ['twister ribozyme','clan/CL00120'],
                                 ['Twister-P5','clan/CL00120'],
                                 ['Twister-P3','clan/CL00120'],
                                 ['RNAse P','family/RF00009']#,
                                 #['VS ribozyme',''] absent in RFAM
                                 ])

ribozyme_rfam_map.to_csv(processed_data_location + 'ribozyme_RFAM_MAP.txt', header=None, sep='\t', index=None)

***
### MINTbase - GtRNAdb tRNA mapping 

**Purpose:** To map MINTbase to GtRNAdb identifiers.

**Output:** `tRNA_MINTbase_GtRNAdb_MAP.txt`

Note: Provided by [MINTbase](https://cm.jefferson.edu/MINTbase/).

In [None]:
tRNA_MINTbase_GtRNAdb_map = pd.read_csv(unprocessed_data_location + 'MINTbase-gtRNAdb_mapping.txt',sep='\t')
tRNA_MINTbase_GtRNAdb_map = tRNA_MINTbase_GtRNAdb_map[['MINTbase tRNA name','gtRNAdb name']]
tRNA_MINTbase_GtRNAdb_map = tRNA_MINTbase_GtRNAdb_map[tRNA_MINTbase_GtRNAdb_map['gtRNAdb name'] != '-']
tRNA_MINTbase_GtRNAdb_map

In [None]:
tRNA_MINTbase_GtRNAdb_map.to_csv(
    processed_data_location + 'tRNA_MINTbase_GtRNAdb_MAP.txt', header=None, sep='\t', index=None)

***
### Tissue labels+synonyms from Uberon - Uberon mapping


**Purpose:** To map Tissue labels+synonyms from Uberon to Uberon identifiers.

**Output:** `DESC_EXT_MAP.txt` + `SYN_EXT_MAP.txt`

In [None]:
desc_uberon_map = gets_ontology_lookup('ext')
desc_uberon_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_uberon_map = pd.read_csv(processed_data_location + 'DESC_EXT_MAP.txt', header=None, sep='\t')

***
### Cell line labels+synonyms from CLO - CLO mapping


**Purpose:** To map Cell line labels+synonyms from CLO to CLO identifiers.

**Output:** `DESC_CLO_MAP.txt` + `SYN_CLO_MAP.txt`

In [None]:
desc_clo_map = gets_ontology_lookup('clo')
desc_clo_map

In [None]:
# If the chunk above has already been run, you can skip it and run only the following line to speed up construction:
desc_clo_map = pd.read_csv(processed_data_location + 'DESC_CLO_MAP.txt', header=None, sep='\t')

***
***
## DOWNLOAD AND PROCESS EDGE DATASETS  <a class="anchor" id="create-edges"></a>
***
***

## Edges provided by Human Disease benchmark KG
Here, we adjust edges provided by PheKnowLator ecosystem to make them compliant to RNA-KG identifiers. This is done in order to merge RNA-KG to Human Disease benchmark KG without redundancies or different identifiers referring to the same concept.

In [None]:
# Original edge files published with the PheKnowLator build; each file is listed exactly once
# so nothing is requested twice.
for edge in ['CTD_chem_gene_ixns.tsv',
             'CTD_chem_go_enriched.tsv',
             'ChEBI2Reactome_All_Levels.txt',
             'CTD_chemicals_diseases.tsv',
             'phenotype.hpoa',
             'curated_gene_disease_associations.tsv',
             'COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt',
             'CTD_genes_pathways.tsv',
             'gene_association.reactome',
             'goa_human.gaf',
             'UniProt2Reactome_All_Levels.txt',
             '9606.protein.links.v11.0.txt']:
    data_downloader(original_url+edge, edge_data_location)

### Gene-RNA

In [None]:
gene_rna = pd.read_csv(processed_data_location+'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', sep='\t',header=None)
gene_rna[3].unique()

In [None]:
# Collapse the fine-grained transcript type labels into the coarser RNA
# classes; the replacement is applied to every cell of gene_rna that exactly
# matches one of the labels.
mrna_labels = [
    'processed_transcript',
    'protein_coding',
    'non_stop_decay',
    'nonsense_mediated_decay',
]
pseudogene_labels = [
    'transcribed_processed_pseudogene',
    'transcribed_unitary_pseudogene',
    'transcribed_unprocessed_pseudogene',
    'polymorphic_pseudogene',
    'unprocessed_pseudogene',
    'processed_pseudogene',
    'unitary_pseudogene',
    'pseudogene',
]

label_to_class = dict.fromkeys(mrna_labels, 'mRNA')
label_to_class.update(dict.fromkeys(pseudogene_labels, 'pseudo'))
label_to_class['Mt_tRNA'] = 'mt_tRNA'

gene_rna.replace(label_to_class, inplace=True)

In [None]:
# Build gene -> precursor-miRNA edges.
# Keep only the rows of gene_rna whose type (column 3) is 'miRNA'.
gene_premiRNA2511 = gene_rna[gene_rna[3] == 'miRNA']
# Cast '0_y' to int64 so it can be joined with column 0 of gene_rna.
# NOTE: this modifies symbol_entrez_map itself, not a copy.
symbol_entrez_map['0_y'] = symbol_entrez_map['0_y'].astype('int64')
# Join on column 0 ('0_y' of symbol_entrez_map is renamed to 0 for the merge).
gene_premiRNA2511 = pd.merge(gene_premiRNA2511, symbol_entrez_map.rename(columns={'0_y':0}), on=0)

# Rewrite the '0_x' value into a 'hsa-mir-...' / 'hsa-let-...' style name:
# lower-case it, insert a hyphen after 'mir' / 'let', and prefix 'hsa-'.
gene_premiRNA2511['0_x'] = 'hsa-' + gene_premiRNA2511['0_x'].str.lower().str.replace(
    'mir','mir-').str.replace('let','let-')

# Join with mirna_mirbase_map on the rewritten name (its column 2, renamed 'a').
# Column 0 exists in both frames, so the merge yields '0_x' and '0_y' again.
gene_premiRNA2511 = pd.merge(gene_premiRNA2511.rename(columns={'0_x':'a'}), mirna_mirbase_map.rename(columns={2:'a'}), on=['a'])
gene_premiRNA2511

In [None]:
gene_premiRNA2511[['0_x','0_y']].drop_duplicates().to_csv(
    edge_data_location +'Hgene-premiRNA.txt', header=None, sep='\t', index=None)

In [None]:
gene_rna[6] = gene_rna[0].astype(str) + '?' + gene_rna[3].astype(str)

In [None]:
# Write one gene-RNA edge file per RNA type found in column 3.
# Rows of type 'miRNA' are skipped by this loop.
for rna_type in gene_rna[3].unique():
    if rna_type == 'miRNA':
        continue

    type_edges = gene_rna.loc[gene_rna[3] == rna_type, [0, 6]]
    if type_edges.empty:
        continue

    out_path = edge_data_location + 'Hgene-' + rna_type + '.txt'
    type_edges.drop_duplicates().to_csv(out_path, header=None, sep='\t', index=None)

### RNA-protein

In [None]:
# Build mRNA -> protein edges from the transcript/protein map.
mRNA_protein = pd.read_csv(processed_data_location+'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt', sep='\t',header=None)
# Keep only rows whose column 4 is 'protein-coding'.
mRNA_protein = mRNA_protein[mRNA_protein[4] == 'protein-coding']
# Join with entrez_enst_map on the shared key 'a' (column 0 here, column 1 there);
# after the merge, column 0 comes from entrez_enst_map and column 1 from the file read above.
mRNA_protein = pd.merge(mRNA_protein.rename(columns={0:'a'}), entrez_enst_map.rename(columns={1:'a'}), on='a')[[0, 1]]
# Append the '?mRNA' suffix to the identifier in column 0.
mRNA_protein[0] = mRNA_protein[0].astype(str) + '?mRNA' 
mRNA_protein

In [None]:
mRNA_protein.drop_duplicates().to_csv(
    edge_data_location + 'HmRNA-protein.txt', header=None, sep='\t', index=None)

### RNA-anatomy

In [None]:
# Build RNA -> anatomy edges.
RNA_anatomy = pd.read_csv(processed_data_location+'HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt', sep='\t',header=None)
# Keep rows with transcript-level evidence (column 3) that are labelled 'anatomy' (column 4).
RNA_anatomy = RNA_anatomy[(RNA_anatomy[3]=='Evidence at transcript level') & (RNA_anatomy[4]=='anatomy')]

# Both join keys are cast to str so the merge below compares like with like.
# NOTE: these two assignments modify symbol_entrez_map and gene_rna themselves.
symbol_entrez_map['0_y'] = symbol_entrez_map['0_y'].astype(str)
gene_rna[0] = gene_rna[0].astype(str)
# Attach the '0_x' column of symbol_entrez_map to every gene_rna row (join key '0_y').
rna_pro = pd.merge(gene_rna.rename(columns={0:'0_y'}), symbol_entrez_map, on='0_y')
# Join with the filtered edge table on 'a' ('0_x' on the left, column 1 on the right);
# columns present in both frames receive '_x' / '_y' suffixes.
RNA_anatomy = pd.merge(rna_pro.rename(columns={'0_x':'a'}), RNA_anatomy.rename(columns={1:'a'}), on='a')

RNA_anatomy

In [None]:
# miRNA rows of the RNA-anatomy table.
# An explicit copy is taken because column 'a' is overwritten below: assigning
# into a filtered slice of RNA_anatomy triggers SettingWithCopyWarning and is
# not guaranteed to behave consistently across pandas versions.
mirna_anatomy1025 = RNA_anatomy[RNA_anatomy['3_x'] == 'miRNA'].copy()
# Rewrite the name into 'hsa-mir-...' / 'hsa-let-...' form: lower-case it,
# insert a hyphen after 'mir' / 'let', and prefix 'hsa-'.
mirna_anatomy1025['a'] = 'hsa-' + mirna_anatomy1025['a'].str.lower().str.replace(
    'mir','mir-').str.replace('let','let-')

# Join with mirna_mirbase_map on the rewritten name (its column 2, renamed 'a').
# '0_y' is renamed to 'b' first, so the '0_y' produced by this merge comes from
# the column-0 overlap between the two frames.
mirna_anatomy1025 = pd.merge(mirna_anatomy1025.rename(columns={'0_y':'b'}), mirna_mirbase_map.rename(columns={2:'a'}), on='a')
mirna_anatomy1025

In [None]:
mirna_anatomy1025[['0_y','5_y']].drop_duplicates().to_csv(
    edge_data_location + 'HpremiRNA-anatomy.txt', header=None, sep='\t', index=None)

In [None]:
RNA_anatomy[6] = RNA_anatomy['0_y'].astype(str) + '?' + RNA_anatomy['3_x'].astype(str)

In [None]:
# Write one RNA-anatomy edge file per RNA type found in column '3_x'.
# Rows of type 'miRNA' are skipped by this loop.
for rna_type in RNA_anatomy['3_x'].unique():
    if rna_type == 'miRNA':
        continue

    type_edges = RNA_anatomy.loc[RNA_anatomy['3_x'] == rna_type, [6, '5_y']]
    if type_edges.empty:
        continue

    out_path = edge_data_location + 'H' + rna_type + '-anatomy.txt'
    type_edges.drop_duplicates().to_csv(out_path, header=None, sep='\t', index=None)

### RNA-cell

In [None]:
# Build RNA -> cell line edges.
RNA_cell = pd.read_csv(processed_data_location+'HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt', sep='\t',header=None)
# Keep rows with transcript-level evidence (column 3) that are labelled 'cell line' (column 4).
RNA_cell = RNA_cell[(RNA_cell[3]=='Evidence at transcript level') & (RNA_cell[4]=='cell line')]

# Both join keys are cast to str so the merge below compares like with like.
# NOTE: these two assignments modify symbol_entrez_map and gene_rna themselves.
symbol_entrez_map['0_y'] = symbol_entrez_map['0_y'].astype(str)
gene_rna[0] = gene_rna[0].astype(str)
# Attach the '0_x' column of symbol_entrez_map to every gene_rna row (join key '0_y').
rna_pro = pd.merge(gene_rna.rename(columns={0:'0_y'}), symbol_entrez_map, on='0_y')
# Join with the filtered edge table on 'a' ('0_x' on the left, column 1 on the right);
# columns present in both frames receive '_x' / '_y' suffixes.
RNA_cell = pd.merge(rna_pro.rename(columns={'0_x':'a'}), RNA_cell.rename(columns={1:'a'}), on='a')

RNA_cell

In [None]:
# miRNA rows of the RNA-cell line table.
# An explicit copy is taken because column 'a' is overwritten below: assigning
# into a filtered slice of RNA_cell triggers SettingWithCopyWarning and is not
# guaranteed to behave consistently across pandas versions.
mirna_cell1025 = RNA_cell[RNA_cell['3_x'] == 'miRNA'].copy()
# Rewrite the name into 'hsa-mir-...' / 'hsa-let-...' form: lower-case it,
# insert a hyphen after 'mir' / 'let', and prefix 'hsa-'.
mirna_cell1025['a'] = 'hsa-' + mirna_cell1025['a'].str.lower().str.replace(
    'mir','mir-').str.replace('let','let-')

# Join with mirna_mirbase_map on the rewritten name (its column 2, renamed 'a').
# '0_y' is renamed to 'b' first, so the '0_y' produced by this merge comes from
# the column-0 overlap between the two frames.
mirna_cell1025 = pd.merge(mirna_cell1025.rename(columns={'0_y':'b'}), mirna_mirbase_map.rename(columns={2:'a'}), on='a')
mirna_cell1025.head()

In [None]:
mirna_cell1025[['0_y','5_y']].drop_duplicates().to_csv(
    edge_data_location + 'HpremiRNA-cell.txt', header=None, sep='\t', index=None)

In [None]:
RNA_cell[6] = RNA_cell['0_y'].astype(str) + '?' + RNA_cell['3_x'].astype(str)

In [None]:
# Write one RNA-cell line edge file per RNA type found in column '3_x'.
# Rows of type 'miRNA' are skipped by this loop.
for rna_type in RNA_cell['3_x'].unique():
    if rna_type == 'miRNA':
        continue

    type_edges = RNA_cell.loc[RNA_cell['3_x'] == rna_type, [6, '5_y']]
    if type_edges.empty:
        continue

    out_path = edge_data_location + 'H' + rna_type + '-cell.txt'
    type_edges.drop_duplicates().to_csv(out_path, header=None, sep='\t', index=None)

***
## New edges from RNA sources
Edges are classified according to the types of their interactors. Each RNA source is processed whenever it provides a given relation. Identical relations coming from different sources are combined by joining their dataframes after harmonizing identifiers.

***
### precursor miRNA-miRNA - http://purl.obolibrary.org/obo/RO_0002203 (develops into)
* [miRBase](https://www.mirbase.org/) <br />  The miRBase database is a searchable database of published miRNA sequences and annotation. Each entry represents a predicted hairpin portion of a miRNA transcript (termed mir in the database), with information on the location and sequence of the mature miRNA sequence (termed miR).

In [None]:
data_downloader('https://www.mirbase.org/download/miRNA.dat', processed_data_location)

# Path of the downloaded miRBase EMBL flat file
embl_file = processed_data_location + 'miRNA.dat'

# One list per output column; every parsed record appends one value to each list
data = {
    "ID": [],
    "Description": [],
    "Sequence": [],
    "Comments": [],
    "References": [],
    "Feature Table": []
}

# Iterate through the records in the EMBL file
for record in SeqIO.parse(embl_file, "embl"):
    data["ID"].append(record.id)
    data["Description"].append(record.description)
    data["Sequence"].append(str(record.seq))
    data["Comments"].append(str(record.annotations.get('comment', '')))
    # Number the references from 1 and render each one as "[n] <PubMed URL>".
    # (The previous f-string formatted a tuple, producing "([n], '<url>')".)
    references = [
        f"[{number}] https://pubmed.ncbi.nlm.nih.gov/{ref.pubmed_id}"
        for number, ref in enumerate(record.annotations.get('references', []), start=1)
    ]
    data["References"].append(", ".join(references))
    # One text block per record: the string form of every feature, one per line group
    feature_table = "\n".join(str(feature) for feature in record.features)
    data["Feature Table"].append(feature_table)

df = pd.DataFrame(data)
# Keep human entries only; .copy() so the column assignment below works on an
# independent frame instead of a slice (avoids SettingWithCopyWarning)
df = df[df['Description'].astype(str).str.contains('Homo sapiens')].copy()

# Split each record's feature text at every "type: miRNA" marker and give each
# resulting piece its own row; empty pieces are discarded
df['Feature Table'] = df['Feature Table'].str.split("type: miRNA")
df = df.explode('Feature Table')
df = df[df['Feature Table'] != '']
df

In [None]:
df['Feature Table'] = df['Feature Table'].str.split("\n")
list(df['Feature Table'].loc[57])

In [None]:
def extract_values(row):
    """Turn the text lines of one feature into a Series of named values.

    Lines containing "location: " contribute a 'location' entry holding the
    text that follows the marker. Lines containing "Key: " are read as
    "Key: <name>, Value: ['<value>']" and contribute an entry <name> -> <value>
    with the surrounding brackets and quotes stripped. Any other line is
    ignored. Later lines overwrite earlier ones that share a name.

    :param row: iterable of strings (the lines of one feature block)
    :return: pandas Series indexed by the extracted names
    """
    parsed = {}
    for line in row:
        if "location: " in line:
            parsed['location'] = line.split("location: ")[1]
            continue
        if "Key: " not in line:
            continue

        qualifier = line.split("Key: ")[1]
        pieces = qualifier.split(", Value:")
        name = pieces[0].strip()
        parsed[name] = pieces[1].strip(" ['").strip("'']")
    return pd.Series(parsed)

# Parse every feature block into one column per extracted name
# ('location' plus one column per "Key: ..." qualifier).
new_columns = df['Feature Table'].apply(extract_values)

# Append the parsed columns next to the original record columns.
df = pd.concat([df, new_columns], axis=1)

# Drop the 'product' column; 'accession' is kept and used as the identifier
# in the edge files (presumably 'product' only holds the miRNA name — confirm).
df = df.drop(columns = ['product'])

# Tag every row with its data source.
df['Source(s)'] = 'miRBase'
df

The Description, Sequence, Comments, References, Feature Table, location, evidence, and experiment columns are node properties of premiRNA and miRNA. We can remove them since they are not edge properties. The accession column is kept because it is written as the miRNA identifier in the edge file below.

In [None]:
df = df.drop(columns=['Description', 'Sequence', 'Comments', 'References', 'Feature Table', 'location',
                      'evidence', 'experiment'])
df

In [None]:
premiRNAmiRNA = df.copy()
premiRNAmiRNA[['ID', 'accession', 'Source(s)']].dropna().drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-miRNA.txt', header=None, sep='\t', index=None)

***
### premiRNA-premiRNA - http://purl.obolibrary.org/obo/RO_HOM0000000 (in similarity relationship with)
* [miRBase](https://www.mirbase.org/)

In [None]:
df[['ID', 'similarity', 'Source(s)']].dropna()

In [None]:
df[['ID', 'similarity', 'Source(s)']].dropna().drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-premiRNA.txt', header=None, sep='\t', index=None)

***
### premiRNA-modification (A-to-I) - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)
* [miRBase](https://www.mirbase.org/)

In [None]:
df.mod_base.unique()

In [None]:
df = df[['ID', 'mod_base', 'Source(s)']].dropna()
df.mod_base = 'GO_0006382'
df

In [None]:
df[['ID', 'mod_base', 'Source(s)']].dropna().drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-AtoI.txt', header=None, sep='\t', index=None)

***
### miRNA-gene - http://purl.obolibrary.org/obo/RO_0011002 (regulates activity of)
* [TarBase](https://dianalab.e-ce.uth.gr/html/diana/web/index.php?r=tarbasev8/index) <br />  DIANA-TarBase v8 is a reference database devoted to the indexing of experimentally supported microRNA (miRNA) targets.

In [None]:
data_downloader('https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz', unprocessed_data_location)

In [None]:
# Unpack the TarBase archive next to the other unprocessed downloads.
# NOTE(review): extractall() trusts the member paths inside the archive; only
# use it on archives from a trusted source.
with tarfile.open(unprocessed_data_location+'tarbase_v8_data.tar', 'r') as tar_ref:
    tar_ref.extractall(unprocessed_data_location)

miRNA_gene = pd.read_csv(unprocessed_data_location +
                          'TarBase_v8_download.txt', sep="\t",
                          dtype={"cell_line": "string"})
miRNA_gene['Source(s)'] = 'TarBase|miRNet'

# For the time being, we keep only Homo sapiens rows.
# .copy() makes the filtered rows an independent frame, so the in-place drop
# and the column assignments below do not act on a slice of the original.
miRNA_gene = miRNA_gene[miRNA_gene['species'].str.contains("Homo sapiens")].copy()
miRNA_gene.drop(columns=['geneId','species'], inplace=True)
# Remove the literal "(hsa)" tag from gene names. regex=False is explicit:
# the former pattern "\(hsa\)" relied on the regex default, which is False
# from pandas 2.0 on, and used invalid escape sequences in a normal string.
miRNA_gene['geneName'] = miRNA_gene['geneName'].str.replace('(hsa)', '', regex=False)
miRNA_gene.rename(columns={'mirna': 'miRNA'}, inplace=True)
# Replace the gene name with the matching '0_y' value of symbol_entrez_map
# (exposed here as ENTREZID); rows without a match are dropped by the inner join.
miRNA_gene = pd.merge(symbol_entrez_map[['0_x','0_y']].rename(columns={'0_x': 'geneName','0_y': 'ENTREZID'}),
                       miRNA_gene, on='geneName')
miRNA_gene.drop(columns=['geneName'], inplace=True)
miRNA_gene

***
* [miRTarBase](https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/php/index.php) <br /> miRTarBase has accumulated more than three hundred and sixty thousand miRNA-target interactions (MTIs), which are collected by manually surveying pertinent literature after NLP of the text systematically to filter research articles related to functional studies of miRNAs.

In [None]:
!wget ~/RNA-KG/resources/processed_data/unprocessed_data/https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/miRTarBase_MTI.xlsx

In [None]:
# Load the miRTarBase miRNA-target interactions.
miRNA_gene2 = pd.read_excel(unprocessed_data_location+"miRTarBase_MTI.xlsx")
# Keep rows whose miRNA species and target-gene species both contain 'apiens'
# (the match is case-sensitive, so the capital letter is left out of the pattern).
miRNA_gene2 = miRNA_gene2[miRNA_gene2['Species (miRNA)'].str.contains('apiens')]
miRNA_gene2 = miRNA_gene2[miRNA_gene2['Species (Target Gene)'].str.contains('apiens')]
# Drop the columns that are no longer needed after filtering.
miRNA_gene2.drop(columns=['miRTarBase ID','Species (miRNA)','Target Gene','Species (Target Gene)'], inplace=True)
# Use the same key name as the TarBase frame so both can be merged on it.
miRNA_gene2.rename(columns={'Target Gene (Entrez ID)': 'ENTREZID'}, inplace=True)
miRNA_gene2['Source(s)'] = 'miRTarBase|miRNet'
miRNA_gene2

In [None]:
# Align the key dtype with miRNA_gene2 before merging.
miRNA_gene.ENTREZID = miRNA_gene.ENTREZID.astype('int64')

# Outer merge keeps interactions reported by either source.
miRNA_gene = pd.merge(miRNA_gene, miRNA_gene2, how='outer', on=['miRNA','ENTREZID'])
# Concatenate the two source labels; a missing side becomes the string 'nan'.
miRNA_gene['Source(s)_x'] = miRNA_gene['Source(s)_x'].astype(str)
miRNA_gene['Source(s)_y'] = miRNA_gene['Source(s)_y'].astype(str)
miRNA_gene['Source(s)'] = miRNA_gene['Source(s)_x'] + '|' + miRNA_gene['Source(s)_y']
# Collapse an immediately repeated '|miRNet|miRNet' into a single '|miRNet'.
# regex=True is required: from pandas 2.0 on, str.replace treats the pattern
# as a literal string unless told otherwise.
# NOTE(review): the labels built above never contain two adjacent '|miRNet',
# so this does not remove the second 'miRNet' of 'TarBase|miRNet|miRTarBase|miRNet' — confirm intent.
miRNA_gene['Source(s)'] = miRNA_gene['Source(s)'].str.replace(r'(\|miRNet){2}', '|miRNet', regex=True)

miRNA_gene = miRNA_gene.drop(columns=['Source(s)_x', 'Source(s)_y'])
miRNA_gene

In [None]:
# Fold 'Experiments' into 'method'.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form does not update the DataFrame
# when pandas copy-on-write is active.
miRNA_gene['method'] = miRNA_gene['method'].fillna(miRNA_gene['Experiments'])
# Where both columns hold different values, keep both, separated by '|'.
both_differ = ((miRNA_gene["method"].notna()) &
               (miRNA_gene["Experiments"].notna()) &
               (miRNA_gene["method"] !=
                miRNA_gene["Experiments"]))
miRNA_gene.loc[both_differ, "method"] = miRNA_gene["method"] + '|' + miRNA_gene["Experiments"]
miRNA_gene.drop(columns=['Experiments'],inplace=True)

miRNA_gene

***
* [TargetScan](https://www.targetscan.org/vert_80/) <br /> TargetScan predicts biological targets of miRNAs by searching for the presence of conserved 8mer, 7mer, and 6mer sites that match the seed region of each miRNA. 

In [None]:
!wget https://www.targetscan.org/vert_80/vert_80_data_download/Predicted_Targets_Context_Scores.default_predictions.txt.zip

In [None]:
miRNA_gene2 = pd.read_csv(unprocessed_data_location+'Predicted_Targets_Context_Scores.default_predictions.txt.zip',
                          sep='\t')
miRNA_gene2 = miRNA_gene2[miRNA_gene2['Gene Tax ID'] == 9606]
miRNA_gene2.drop(columns=['Gene ID','Transcript ID','Gene Tax ID'], inplace=True)
miRNA_gene2

In [None]:
# Replace the gene symbol with the matching '0_y' value of symbol_entrez_map.
miRNA_gene2 = pd.merge(miRNA_gene2, symbol_entrez_map.rename(columns={'0_x': 'Gene Symbol'}), on='Gene Symbol')
miRNA_gene2.drop(columns=['Gene Symbol'], inplace=True)
miRNA_gene2['Source(s)'] = 'TargetScan'

# Align the key dtype with miRNA_gene.ENTREZID before merging.
miRNA_gene2['0_y'] = miRNA_gene2['0_y'].astype('int64')

# Outer merge keeps interactions reported by either side.
miRNA_gene = pd.merge(miRNA_gene, miRNA_gene2.rename(columns={'0_y':'ENTREZID'}), how='outer',
                      on=['miRNA','ENTREZID'])
# Concatenate the source labels; a missing side becomes the string 'nan'.
miRNA_gene['Source(s)_x'] = miRNA_gene['Source(s)_x'].astype(str)
miRNA_gene['Source(s)_y'] = miRNA_gene['Source(s)_y'].astype(str)
miRNA_gene['Source(s)'] = miRNA_gene['Source(s)_x'] + '|' + miRNA_gene['Source(s)_y']
miRNA_gene = miRNA_gene.drop(columns=['Source(s)_x', 'Source(s)_y'])

# Strip the 'nan' placeholders (with their separator) left by missing labels.
# Raw string: '\|' is an invalid escape sequence in a normal string literal.
miRNA_gene['Source(s)'] = miRNA_gene['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)
miRNA_gene

In the chunks below, we manually fix some inconsistencies.

In [None]:
miRNA_gene['up_down'] = miRNA_gene['up_down'].replace('UNKNOWN', np.nan)
miRNA_gene['References (PMID)'] = 'https://pubmed.ncbi.nlm.nih.gov/' + miRNA_gene['References (PMID)'].astype('Int64').astype(str)
miRNA_gene['References (PMID)'] = miRNA_gene['References (PMID)'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)

- Cell line.

In [None]:
miRNA_gene['cell_line'].unique()[:5]

In [None]:
# Work on a copy so desc_clo_map itself keeps its original values.
desc_clo_map2 = desc_clo_map.copy()
# Turn column 1 into '|'-separated full URIs followed by the label in brackets.
desc_clo_map2[1] = desc_clo_map2[1].str.replace(', ', '|http://purl.obolibrary.org/obo/')
desc_clo_map2[1] = 'http://purl.obolibrary.org/obo/' + desc_clo_map2[1] + ' (' + desc_clo_map2[0] + ')'

# Normalised lookup key: lower case, 'cells' -> 'cell', 'lines' -> 'line', trailing 's' removed.
miRNA_gene['cellCleaned'] = miRNA_gene['cell_line'].str.lower().str.replace('cells', 'cell').str.replace(
    'lines', 'line').str.replace(r's$', '', regex=True)
# Left merge keeps every interaction, matched or not.
miRNA_gene = pd.merge(miRNA_gene,
                     desc_clo_map2,
                     left_on=['cellCleaned'],
                     right_on=[0],
                     how='left')

# Fall back to the original cell-line text where no mapping was found.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form does not update the DataFrame
# when pandas copy-on-write is active.
miRNA_gene[1] = miRNA_gene[1].fillna(miRNA_gene['cell_line'])
miRNA_gene.drop(columns=['cell_line'], inplace=True)
miRNA_gene.rename(columns={1: 'cell_line'}, inplace=True)
miRNA_gene.drop(columns=['cellCleaned',0], inplace=True)
miRNA_gene['cell_line'] = miRNA_gene['cell_line'].astype(str).replace('<NA>', np.nan)
miRNA_gene

- Method.

In [None]:
# Normalise the separators between methods to '|' and lower-case the text.
# regex=False is explicit: these are literal replacements, and the default of
# the regex argument differs between pandas versions (a lone backslash is not
# a valid regular expression).
miRNA_gene['method'] = miRNA_gene['method'].str.replace('\\', '|', regex=False)

miRNA_gene.method = miRNA_gene.method.str.lower()
miRNA_gene.method = miRNA_gene.method.str.replace("//", '|', regex=False)
miRNA_gene.method = miRNA_gene.method.str.replace("/", '|', regex=False)

# Lookup from column 0 of desc_ncit_map to '<URI built from column 1> (<column 0>)'.
ncit_dict = dict(zip(desc_ncit_map[0], 'http://purl.obolibrary.org/obo/' + desc_ncit_map[1] +
                      ' (' + desc_ncit_map[0] + ')'))

def replace_with_ncit(substring):
    """Replace each '|'-separated term by its entry in ncit_dict.

    Terms that are not keys of ncit_dict are kept unchanged. A missing value
    (anything pd.isna reports as missing) yields NaN.

    :param substring: '|'-separated string of terms, or a missing value
    :return: the rewritten '|'-separated string, or NaN
    """
    if not pd.isna(substring):
        terms = substring.split('|')
        return '|'.join(ncit_dict.get(term, term) for term in terms)
    return np.nan

miRNA_gene.method = [replace_with_ncit(item) for item in miRNA_gene.method]
miRNA_gene.method.unique()[:5]

- Tissue.

In [None]:
miRNA_gene.tissue = miRNA_gene.tissue.str.lower()
miRNA_gene.tissue = miRNA_gene.tissue.str.replace("/", '|')
miRNA_gene.tissue = miRNA_gene.tissue.str.replace("larva, whole", 'larva')

miRNA_gene.tissue.unique()[:5]

In [None]:
uberon_dict = dict(zip(desc_uberon_map[0], 'http://purl.obolibrary.org/obo/' + desc_uberon_map[1] +
                      ' (' + desc_uberon_map[0] + ')'))

def replace_with_uberon(substring):
    """Replace each '|'-separated term by its entry in uberon_dict.

    Terms that are not keys of uberon_dict are kept unchanged. A missing value
    (anything pd.isna reports as missing) yields NaN.

    :param substring: '|'-separated string of terms, or a missing value
    :return: the rewritten '|'-separated string, or NaN
    """
    if not pd.isna(substring):
        terms = substring.split('|')
        return '|'.join(uberon_dict.get(term, term) for term in terms)
    return np.nan

miRNA_gene.tissue = [replace_with_uberon(item) for item in miRNA_gene.tissue]
miRNA_gene.tissue.unique()[:5]

Let's divide miRNA-gene interactions into mature_miRNA-gene interactions and stem-loop_miRNA-gene interactions.

In [None]:
miRNA_gene = pd.merge(mirna_mirbase_map.rename(columns={2: 'miRNA'}), miRNA_gene, on='miRNA')
miRNA_gene.drop(columns=['miRNA'], inplace=True)

# move 'Source(s)' column to the end of the dataframe
miRNA_gene.insert(len(miRNA_gene.columns)-1, 'Source(s)', miRNA_gene.pop('Source(s)'))

miRNA_gene

Does miRNA_gene contain only mature miRNA sequences?

In [None]:
all(miRNA_gene[0].str.startswith('MIMAT'))

In [None]:
# Split miRNA_gene into one frame per relation type.
# Each condition is computed once and reused, and every subset is an explicit
# copy because the subsets are modified in place (drop / rename) further down;
# mutating a filtered slice triggers SettingWithCopyWarning.
# NOTE(review): the files written further down are named with 2449 / 11016 /
# 2450 / 11013, not with the RO ids quoted in the comments below — confirm
# which relation ids are intended.
# NOTE(review): a row with up_down == 'UP' and positive_negative == 'NEGATIVE'
# (or the reverse) lands in both the "up" and the "down" subsets.
is_direct = miRNA_gene['direct_indirect'] == 'DIRECT'
is_indirect = miRNA_gene['direct_indirect'] == 'INDIRECT'
no_direction = miRNA_gene['direct_indirect'].isna()
is_up = (miRNA_gene['up_down'] == 'UP') | (miRNA_gene['positive_negative'] == 'POSITIVE')
is_down = (miRNA_gene['up_down'] == 'DOWN') | (miRNA_gene['positive_negative'] == 'NEGATIVE')
no_sign = miRNA_gene['up_down'].isna() & miRNA_gene['positive_negative'].isna()

#directly regulates - RO:0002578
miRNA_gene_direct = miRNA_gene[is_direct & no_sign].copy()
#indirectly regulates - RO:0012012
miRNA_gene_indirect = miRNA_gene[is_indirect & no_sign].copy()
#positively regulates - RO:0002213
miRNA_gene_up = miRNA_gene[is_up & no_direction].copy()
#negatively regulates - RO:0002212
miRNA_gene_down = miRNA_gene[is_down & no_direction].copy()

#directly negatively regulates - RO:0002630
miRNA_gene_direct_down = miRNA_gene[is_down & is_direct].copy()
#indirectly negatively regulates - RO:0002409
miRNA_gene_indirect_down = miRNA_gene[is_down & is_indirect].copy()
#directly positively regulates - RO:0002629
miRNA_gene_direct_up = miRNA_gene[is_up & is_direct].copy()
#indirectly positively regulates - RO:0002407
miRNA_gene_indirect_up = miRNA_gene[is_up & is_indirect].copy()
#regulates activity of - RO:0011002
# Reassigned last: every mask above was computed on the full table.
miRNA_gene = miRNA_gene[no_direction & no_sign].copy()

In [None]:
# Record, for each relation-specific frame, whether it contains no rows,
# then print the nine flags in the same order.
(miRNA_gene_direct_empty,
 miRNA_gene_indirect_empty,
 miRNA_gene_up_empty,
 miRNA_gene_down_empty,
 miRNA_gene_direct_down_empty,
 miRNA_gene_indirect_down_empty,
 miRNA_gene_direct_up_empty,
 miRNA_gene_indirect_up_empty,
 miRNA_gene_empty) = (
    frame.empty for frame in (
        miRNA_gene_direct,
        miRNA_gene_indirect,
        miRNA_gene_up,
        miRNA_gene_down,
        miRNA_gene_direct_down,
        miRNA_gene_indirect_down,
        miRNA_gene_direct_up,
        miRNA_gene_indirect_up,
        miRNA_gene,
    )
)

print(miRNA_gene_direct_empty, miRNA_gene_indirect_empty, miRNA_gene_up_empty, miRNA_gene_down_empty,
      miRNA_gene_direct_down_empty, miRNA_gene_indirect_down_empty, miRNA_gene_direct_up_empty,
      miRNA_gene_indirect_up_empty, miRNA_gene_empty)

In [None]:
# Method to sanitize rows before writing to file (so that each line contains a unique relationship)
def merge_rows(df, column1, column2):
    """Collapse all rows sharing the same (column1, column2) pair into one row.

    Exact duplicate rows are removed first. Within each group, every other
    column is reduced to the '|'-joined string of its distinct non-null
    values, in order of first appearance, so the output is identical from
    run to run (joining a set gave an arbitrary order). A column with no
    non-null value in a group becomes the empty string.

    :param df: input DataFrame (left unmodified)
    :param column1: first key column
    :param column2: second key column
    :return: DataFrame with one row per (column1, column2) pair
    """
    def _join_unique(values):
        # dict.fromkeys removes repeats while keeping first-seen order
        return '|'.join(dict.fromkeys(str(v) for v in values if pd.notnull(v)))

    df = df.drop_duplicates()
    df_merged = df.groupby([column1, column2]).agg(_join_unique).reset_index()
    return df_merged.drop_duplicates()

In [None]:
# 'up_down', 'positive_negative', and 'direct_indirect' were only needed to
# tell the relationships apart, so they are removed (in place) from each of
# the five frames listed here.
relation_columns = ['up_down', 'positive_negative', 'direct_indirect']
for relation_frame in (miRNA_gene,
                       miRNA_gene_direct_down,
                       miRNA_gene_indirect_down,
                       miRNA_gene_direct_up,
                       miRNA_gene_indirect_up):
    relation_frame.drop(columns=relation_columns, inplace=True)

We properly rename columns before saving files.

In [None]:
miRNA_gene.columns

In [None]:
# Final column labels, applied in place to each of the five frames listed below.
final_column_labels = {
    0: 'miRNA',
    'ENTREZID': 'Gene',
    'cell_line': 'Cell line',
    'tissue': 'Tissue',
    'category': 'Category',
    'method': 'Method',
    'condition': 'Condition',
    'Site Type': 'Site type',
    'UTR_start': 'UTR start',
    'context++ score': 'Context++ score',
    'context++ score percentile': 'Context++ score percentile',
    'weighted context++ score': 'Weighted context++ score',
    'weighted context++ score percentile': 'Weighted context++ score percentile',
}

for relation_frame in (miRNA_gene,
                       miRNA_gene_direct_down,
                       miRNA_gene_indirect_down,
                       miRNA_gene_direct_up,
                       miRNA_gene_indirect_up):
    relation_frame.rename(columns=final_column_labels, inplace=True)

In [None]:
# Collapse each frame to one row per (miRNA, Gene) pair and write it to its
# own tab-separated edge file (header row included, index omitted).
edge_outputs = [
    (miRNA_gene, 'RmiRNA-gene11002.txt'),
    (miRNA_gene_direct_down, 'RmiRNA-gene2449.txt'),
    (miRNA_gene_indirect_down, 'RmiRNA-gene11016.txt'),
    (miRNA_gene_direct_up, 'RmiRNA-gene2450.txt'),
    (miRNA_gene_indirect_up, 'RmiRNA-gene11013.txt'),
]
for relation_frame, file_name in edge_outputs:
    collapsed = merge_rows(relation_frame, 'miRNA', 'Gene')
    collapsed.to_csv(edge_data_location + file_name, sep='\t', index=None)

***
### miRNA-mRNA - http://purl.obolibrary.org/obo/RO_0011002 (regulates activity of)
* [miRDB](https://mirdb.org/index.html) <br />  miRDB is an online database for miRNA target prediction and functional annotations. All the targets in miRDB were predicted by a bioinformatics tool, MirTarget, which was developed by analyzing thousands of miRNA-target interactions from high-throughput sequencing experiments.

In [None]:
data_downloader('https://mirdb.org/download/miRDB_v6.0_prediction_result.txt.gz', unprocessed_data_location)

In [None]:
# Load the miRDB predictions; the file has no header row, so the three
# columns are named here.
miRNA_mRNA = pd.read_csv(unprocessed_data_location+'miRDB_v6.0_prediction_result.txt', sep='\t', names=['miRNA', 'mRNA', 'score'])
miRNA_mRNA['Source(s)'] = 'miRDB'
# For the time being, we keep only Homo sapiens rows
# (selected by the 'hsa' prefix of the miRNA name).
miRNA_mRNA = miRNA_mRNA[miRNA_mRNA['miRNA'].str.startswith("hsa")]

# From miRDB:
# All the predicted targets have target prediction scores between 50 - 100.
# These scores are assigned by the new computational target prediction algorithm.
# The higher the score, the more confidence we have in this prediction.
# That is why the search result is ordered by prediction score.
# In our experience, a predicted target with prediction score > 80 is most likely to be real.
# If the score is below 60, you need to be cautious and it is recommended to have other supporting evidence as well.  
# Following that guidance, only predictions scoring strictly above 80 are kept.
miRNA_mRNA = miRNA_mRNA[miRNA_mRNA['score']>80]
miRNA_mRNA

***
* [miRecords](http://c1.accurascience.com/miRecords/download_data.php?v=4) <br />  miRecords is a resource for animal miRNA-target interactions.

In [None]:
!wget http://c1.accurascience.com/miRecords/download_data.php?v=4

In [None]:
miRNA_mRNA2 = pd.read_excel(unprocessed_data_location+"miRecords_version4.xls") 
miRNA_mRNA2['Source(s)'] = 'miRecords|miRNet'

# For the time being, we keep only Homo sapiens rows
miRNA_mRNA2 = miRNA_mRNA2[miRNA_mRNA2['Target gene_species_scientific'].str.contains("apiens")]
miRNA_mRNA2 = miRNA_mRNA2[miRNA_mRNA2['miRNA_species'].str.contains("apiens")]
miRNA_mRNA2['Target gene_Refseq_acc']= miRNA_mRNA2['Target gene_Refseq_acc'].str.split(".").str[0]
miRNA_mRNA2.rename(columns={'Target gene_Refseq_acc': 'mRNA', 'miRNA_mature_ID': 'miRNA'}, inplace=True)
miRNA_mRNA2.drop(columns=['Target gene_species_scientific','Target gene_name','miRNA_species'], inplace=True)

miRNA_mRNA2

In [None]:
mRNA_miRNA = pd.merge(miRNA_mRNA, miRNA_mRNA2, how='outer', on=['mRNA', 'miRNA'])
mRNA_miRNA

In [None]:
mRNA_miRNA['Source(s)_x'] = mRNA_miRNA['Source(s)_x'].astype(str)
mRNA_miRNA['Source(s)_y'] = mRNA_miRNA['Source(s)_y'].astype(str)
mRNA_miRNA['Source(s)'] = mRNA_miRNA['Source(s)_x'] + '|' + mRNA_miRNA['Source(s)_y']
mRNA_miRNA = mRNA_miRNA.drop(columns=['Source(s)_x', 'Source(s)_y'])
mRNA_miRNA

In [None]:
mRNA_miRNA[['mRNA']].to_csv(unprocessed_data_location + 'mRNA.txt', header=None, index=None)

In [None]:
# NOTE(review): 'mRNA.csv' is not written by any cell shown here; it is
# presumably an externally produced mapping for the accessions exported to
# 'mRNA.txt' (an ENTREZID column is used after the merge below) — confirm how it is generated.
mRNA = pd.read_csv(unprocessed_data_location+'mRNA.csv')
# Rename the accession column so it can serve as the merge key 'mRNA'.
mRNA.rename(columns={'ACCNUM': 'mRNA'}, inplace=True)
mRNA

In [None]:
mRNA_miRNA = pd.merge(mRNA_miRNA, mRNA, on=['mRNA'])
mRNA_miRNA.insert(1, 'ENTREZID', mRNA_miRNA.pop("ENTREZID"))
mRNA_miRNA

***
* [SomamiR](https://compbio.uthsc.edu/SomamiR/) <br /> SomamiR is a database of cancer somatic mutations in microRNAs (miRNA) and their target sites that potentially alter the interactions between miRNAs and competing endogenous RNAs (ceRNA) including mRNAs, circular RNAs (circRNA) and long noncoding RNAs (lncRNA).

In [None]:
!wget https://compbio.uthsc.edu/SomamiR/download/predicted_mRNA_targets_somamir_v2.0.txt.tar.gz

In [None]:
mRNA_miRNA2 = pd.read_csv(unprocessed_data_location+'predicted_mRNA_targets_somamir_v2.0.txt.tar.gz',sep='\t')
mRNA_miRNA2.drop(columns=['Chromosome','strand','Organisms'],inplace=True)
# We select only relationships validated by TargetScan
mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['Targetscan']==1] 
mRNA_miRNA2.drop(columns=['Targetscan'],inplace=True)
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, symbol_entrez_map.rename(columns={'0_x':'Genesymbol'}), on='Genesymbol')
mRNA_miRNA2.drop(columns=['Genesymbol'],inplace=True)
mRNA_miRNA2.rename(columns={'0_y':'ENTREZID'},inplace=True)
mRNA_miRNA2['Source(s)'] = 'SomamiR'
mRNA_miRNA2

In [None]:
# Outer merge keeps interactions reported by either side.
mRNA_miRNA = pd.merge(mRNA_miRNA, mRNA_miRNA2, how='outer', on=['miRNA','ENTREZID'])

# Combine the two PubMed id columns into 'Pubmed_id'.
# The "is a value present?" tests run BEFORE the string conversion: once the
# columns are strings, a missing id is just the text 'nan', the notna() test
# is always true, and the ids were joined into values such as '12345|nan'.
left_pmid = mRNA_miRNA['Pubmed_id']
right_pmid = mRNA_miRNA['Pubmedid']
both_differ = (left_pmid.notna() &
               right_pmid.notna() &
               (left_pmid.astype(str) != right_pmid.astype(str)))
# Take the left id, falling back to the right one; as before, the column ends
# up as strings, so rows with no id at all still hold the missing-value text.
merged_pmid = left_pmid.fillna(right_pmid).astype(str)
# Where both sides hold different ids, keep both, separated by '|'.
merged_pmid[both_differ] = (left_pmid[both_differ].astype(str) + '|' +
                            right_pmid[both_differ].astype(str))
mRNA_miRNA['Pubmed_id'] = merged_pmid
mRNA_miRNA.drop(columns=['Pubmedid'],inplace=True)

# Concatenate the source labels; a missing side becomes the string 'nan'.
mRNA_miRNA['Source(s)_x'] = mRNA_miRNA['Source(s)_x'].astype(str)
mRNA_miRNA['Source(s)_y'] = mRNA_miRNA['Source(s)_y'].astype(str)
mRNA_miRNA['Source(s)'] = mRNA_miRNA['Source(s)_x'] + '|' + mRNA_miRNA['Source(s)_y']
mRNA_miRNA = mRNA_miRNA.drop(columns=['Source(s)_x', 'Source(s)_y'])

mRNA_miRNA

***
* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php) <br /> miRdSNP is a database of disease-associated SNPs and microRNA target sites on 3'UTRs of human genes.

In [None]:
!wget http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv

In [None]:
# Read the miRdSNP target table.
_mirdsnp_targets = pd.read_csv(
    unprocessed_data_location + 'mirdsnp-dsnp-generated-mir-targets-v11.03.csv')
# The filter on experimentally_confirmed == 'Yes' is left disabled here;
# the column is simply dropped.
#mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['experimentally_confirmed']=='Yes']
_mirdsnp_targets = _mirdsnp_targets.drop(columns=['experimentally_confirmed'])

# Join on the gene symbol, then expose the mapped identifier as 'ENTREZID' and 'miR' as 'miRNA'.
_gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'gene_name'})
mRNA_miRNA2 = (
    _mirdsnp_targets.merge(_gene_lookup, on='gene_name')
    .drop(columns=['gene_name'])
    .rename(columns={'0_y': 'ENTREZID', 'miR': 'miRNA'})
)
mRNA_miRNA2['Source(s)'] = 'miRdSNP'
mRNA_miRNA2

In [None]:
# Outer-join the miRdSNP pairs onto the accumulated miRNA-mRNA table.
mRNA_miRNA = pd.merge(mRNA_miRNA, mRNA_miRNA2, how='outer', on=['miRNA', 'ENTREZID'])

# Merge 'Cancertype' into 'diseases'. Filled columns are assigned back rather than using
# fillna(inplace=True) on a column selection, which is chained assignment and does not
# update the parent frame when pandas Copy-on-Write is active.
mRNA_miRNA['diseases'] = mRNA_miRNA['diseases'].fillna(mRNA_miRNA['Cancertype'])
mRNA_miRNA.loc[(mRNA_miRNA["diseases"].notna()) &
               (mRNA_miRNA["Cancertype"].notna()) &
               (mRNA_miRNA["diseases"] !=
                mRNA_miRNA["Cancertype"]),
               ["diseases"]] = mRNA_miRNA["diseases"] + '|' + mRNA_miRNA["Cancertype"]
mRNA_miRNA.drop(columns=['Cancertype'],inplace=True)

# Use miRdSNP's 'refseq_id' wherever 'Refseq' is missing.
mRNA_miRNA['Refseq'] = mRNA_miRNA['Refseq'].fillna(mRNA_miRNA['refseq_id'])
mRNA_miRNA.drop(columns=['refseq_id'],inplace=True)

# Concatenate the provenance of both sides of the join into a single 'Source(s)' column.
mRNA_miRNA['Source(s)_x'] = mRNA_miRNA['Source(s)_x'].astype(str)
mRNA_miRNA['Source(s)_y'] = mRNA_miRNA['Source(s)_y'].astype(str)
mRNA_miRNA['Source(s)'] = mRNA_miRNA['Source(s)_x'] + '|' + mRNA_miRNA['Source(s)_y']
mRNA_miRNA = mRNA_miRNA.drop(columns=['Source(s)_x', 'Source(s)_y'])

mRNA_miRNA

In the chunks below, we manually fix some inconsistencies.

In [None]:
# Normalise the '}{'-delimited free-text annotation columns to '|'-delimited values.
for column in ['miRNA_regulation', 'Reporter_target gene/region',
       'Reporter link element', 'Test_method_inter', 'Target gene mRNA_level',
       'Original description', 'Mutation_target region',
       'Post mutation_method', 'Original description_mutation_region',
       'miRNA_regulation_site', 'Reporter_target site',
       'Reporter link element.1', 'Test_method_inter_site',
       'Original description_inter_site', 'Mutation_target site',
       'Post mutation_method_site', 'Original description_mutation_site',
       'Additional note']:
    # regex=False: these are literal separators. As a regular expression '||' is an
    # alternation of empty patterns, which matches at every position and would insert a
    # '|' between all characters.
    mRNA_miRNA[column] = mRNA_miRNA[column].str.replace('}{', '|', regex=False)
    mRNA_miRNA[column] = mRNA_miRNA[column].str.replace('||', '|', regex=False)
    # Whole-value match: a cell that is only a separator carries no information.
    mRNA_miRNA[column] = mRNA_miRNA[column].replace('|', np.nan)

mRNA_miRNA['Target site_position'] = mRNA_miRNA['Target site_position'].replace('unknown', np.nan)
# NOTE(review): 'miRNA_regulation' already had '}{' turned into '|' in the loop above, so
# this first pattern can no longer occur — confirm whether it should run before the loop.
mRNA_miRNA['miRNA_regulation'] = mRNA_miRNA['miRNA_regulation'].str.replace(
    'overexpression by siRNA transfection}{mutation', 'mutation}{overexpression by siRNA transfection', regex=False)
# regex=False: as a regex the leading '|' makes the empty alternative win everywhere,
# turning the replacement into a no-op; the literal text is what has to be removed.
mRNA_miRNA['miRNA_regulation'] = mRNA_miRNA['miRNA_regulation'].str.replace(
    '|mutation|overexpression by siRNA transfection', '', regex=False)

# Strip the remaining braces from the reporter link element.
mRNA_miRNA['Reporter link element'] = mRNA_miRNA['Reporter link element'].str.replace('}{', '|', regex=False)
mRNA_miRNA['Reporter link element'] = mRNA_miRNA['Reporter link element'].str.replace('{', '', regex=False)
mRNA_miRNA['Reporter link element'] = mRNA_miRNA['Reporter link element'].str.replace('}', '', regex=False)

# Collapse duplicated method annotations (literal matches, braces are not regex syntax here).
mRNA_miRNA['Test_method_inter_site'] = mRNA_miRNA['Test_method_inter_site'].str.replace(
    '{activity assay}{activity assay}', '{activity assay}', regex=False)
mRNA_miRNA['Test_method_inter_site'] = mRNA_miRNA['Test_method_inter_site'].replace('{N/A}', np.nan)
mRNA_miRNA['Post mutation_method_site'] = mRNA_miRNA['Post mutation_method_site'].str.replace(
    '{Luciferase activity assay}{Luciferase activity assay}', '{Luciferase activity assay}', regex=False)

# Append the RefSeq accession to 'mRNA' when both are present and differ, then drop 'Refseq'.
mRNA_miRNA.loc[(mRNA_miRNA["mRNA"].notna()) &
                     (mRNA_miRNA["Refseq"].notna()) &
                     (mRNA_miRNA["mRNA"] != mRNA_miRNA["Refseq"]),
                     ["mRNA"]] = mRNA_miRNA["mRNA"] + '|' + mRNA_miRNA["Refseq"]
mRNA_miRNA = mRNA_miRNA.drop(columns=['Refseq'])

# Turn identifiers into URLs; a stringified NaN ('...nan') is mapped back to NaN.
mRNA_miRNA['U_mut_id'] = "https://cancer.sanger.ac.uk/cosmic/search?q=" + mRNA_miRNA['U_mut_id'].astype(str)
mRNA_miRNA['U_mut_id'] = mRNA_miRNA['U_mut_id'].replace("https://cancer.sanger.ac.uk/cosmic/search?q=nan", np.nan)

mRNA_miRNA['SNP'] = "https://www.ncbi.nlm.nih.gov/snp/" + mRNA_miRNA['SNP'].astype(str)
mRNA_miRNA['SNP'] = mRNA_miRNA['SNP'].replace("https://www.ncbi.nlm.nih.gov/snp/nan", np.nan)

# Remove stringified NaNs that were concatenated next to a real reference.
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].astype(str).str.replace(r'nan\||\|nan', '', regex=True)
# Drop only a trailing float artefact ('123.0' -> '123'). The unescaped pattern '.0'
# matches ANY character followed by '0' and silently deletes digits inside identifiers.
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].astype(str).str.replace(r'\.0(?=\||$)', '', regex=True)
mRNA_miRNA['Pubmed_id'] = 'https://pubmed.ncbi.nlm.nih.gov/' + mRNA_miRNA['Pubmed_id']
# Substring (str.replace) rather than whole-value (Series.replace) substitution, so every
# reference after a '|' gets the URL prefix and the ICGC/PCGP prefixes are stripped
# wherever they appear.
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].str.replace(
    '|', '|https://pubmed.ncbi.nlm.nih.gov/', regex=False)
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].str.replace(
    'https://pubmed.ncbi.nlm.nih.gov/ICGC', 'ICGC', regex=False)
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].str.replace(
    'https://pubmed.ncbi.nlm.nih.gov/PCGP', 'PCGP', regex=False)
mRNA_miRNA['Pubmed_id'] = mRNA_miRNA['Pubmed_id'].replace('https://pubmed.ncbi.nlm.nih.gov/nan', np.nan)

mRNA_miRNA['Source(s)'] = mRNA_miRNA['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

- Disease.

In [None]:
# Normalise the free-text disease annotations into a '|'-separated, lower-case list.
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.lower()
# regex=False everywhere below: the patterns are literal text. Interpreted as regular
# expressions, "[ns]" is a character class (every 'n' and 's') and "||" matches the empty
# string, so the result would depend on the pandas default for `regex`.
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace("[ns]", ']', regex=False)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace("]", '|', regex=False)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace("[", '', regex=False)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace(", ", '|', regex=False)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace("||", '|', regex=False)
# Whole-value match: a cell consisting only of separators becomes missing.
mRNA_miRNA.diseases = mRNA_miRNA.diseases.replace("|||", np.nan)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace('_', ' ', regex=False)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.str.replace('lung/sclc', 'small cell lung carcinoma', regex=False)
# Genuine regular expressions: drop parenthesised qualifiers and a trailing separator.
mRNA_miRNA.diseases = mRNA_miRNA.diseases.replace(r'\(.*?\)', '', regex=True)
mRNA_miRNA.diseases = mRNA_miRNA.diseases.replace(r'\|$', '', regex=True)
mRNA_miRNA.diseases.unique()[:5]

In [None]:
# Lookup: label (column 0 of desc_disPhe_map) -> "<OBO IRI built from column 1> (<label>)".
_disphe_labels = desc_disPhe_map[0]
_disphe_iris = 'http://purl.obolibrary.org/obo/' + desc_disPhe_map[1] + ' (' + _disphe_labels + ')'
mondo_dict = dict(zip(_disphe_labels, _disphe_iris))


def replace_with_mondo(substring):
    """Translate each '|'-separated term of *substring* through ``mondo_dict``.

    Terms without an entry are kept unchanged; a missing value yields NaN.
    """
    if pd.isna(substring):
        return np.nan
    translated = []
    for term in substring.split('|'):
        translated.append(mondo_dict.get(term, term))
    return '|'.join(translated)


mRNA_miRNA.diseases = list(map(replace_with_mondo, mRNA_miRNA.diseases))
mRNA_miRNA.diseases.unique()

In [None]:
# Underscores in the headers become spaces first; the mapping below relies on that
# (e.g. it looks up 'Pubmed id', not 'Pubmed_id').
mRNA_miRNA.columns = mRNA_miRNA.columns.str.replace('_', ' ')

# Final, human-readable column names.
_mrna_mirna_headers = {
    'mRNA': 'Transcript',
    'ENTREZID': 'mRNA',
    'score': 'Score',
    'Pubmed id': 'References (PMID)',
    'Mutationid': 'Mutation ID',
    'Mutationlocation': 'Mutation location',
    'Mutantallele': 'Mutant allele',
    'Targetsiteclass': 'Target site class',
    'mRNAseq': 'mRNA sequence (binding in capital letters)',
    'wildtype csp': 'Wildtype csp',
    'mutant csp': 'Mutant csp',
    'display first': 'Display first',
    'pita ref': 'Pita ref',
    'pita mut': 'Pita mut',
    'pita diff': 'Pita diff',
    'diseases': 'Disease(s)',
    'distance': 'Distance',
}
mRNA_miRNA = mRNA_miRNA.rename(columns=_mrna_mirna_headers)
mRNA_miRNA.columns

Let's divide miRNA-mRNA interactions into mature_miRNA-mRNA interactions and stem-loop_miRNA-mRNA interactions.

In [None]:
# Join on the miRNA name (column 2 of mirna_mirbase_map), drop the name, and promote the
# map's column 0 to be the new 'miRNA' column.
_mirbase_lookup = mirna_mirbase_map.rename(columns={2: 'miRNA'})
mRNA_miRNA = (
    _mirbase_lookup.merge(mRNA_miRNA, on='miRNA')
    .drop(columns=['miRNA'])
    .rename(columns={0: 'miRNA'})
)

# move 'Source(s)' column to the end of the dataframe
_source_column = mRNA_miRNA.pop('Source(s)')
mRNA_miRNA['Source(s)'] = _source_column

mRNA_miRNA

In [None]:
# Tag every target identifier with the '?mRNA' suffix.
mRNA_miRNA['mRNA'] = mRNA_miRNA['mRNA'].astype('str') + '?mRNA'

# Identifiers starting with 'MIMAT' go to the mature table, everything else to the pre-miRNA one.
_mature_mask = mRNA_miRNA['miRNA'].str.startswith('MIMAT')
maturemRNA_miRNA = mRNA_miRNA[_mature_mask]
premRNA_miRNA = mRNA_miRNA[~_mature_mask]

In [None]:
# Collapse rows per (miRNA, mRNA) pair with merge_rows and write one tab-separated
# edge file per miRNA class (pre-miRNA first, then mature).
for _edges, _file_name in ((premRNA_miRNA, 'RpremiRNA-mRNA11002.txt'),
                           (maturemRNA_miRNA, 'RmiRNA-mRNA11002.txt')):
    _collapsed = merge_rows(_edges, 'miRNA', 'mRNA')
    _collapsed.to_csv(edge_data_location + _file_name, sep='\t', index=None)

***
### miRNA-pseudogene - http://purl.obolibrary.org/obo/RO_0011002 (regulates activity of)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Shell escape: download the miRNet miRNA-pseudogene table from Dropbox with wget.
# The saved file keeps the '?dl=0' suffix, which is why the next cell opens
# 'miRNet-mir-pseudogene.csv?dl=0'.
! wget https://www.dropbox.com/s/r01ppq5x42v4lyh/miRNet-mir-pseudogene.csv?dl=0

In [None]:
# Keep only the miRNA accession and the pseudogene identifier from the miRNet table.
_pseudogene_path = unprocessed_data_location + 'miRNet-mir-pseudogene.csv?dl=0'
miRNA_pseudogene = (
    pd.read_csv(_pseudogene_path)
    .drop(columns=['mirnet', 'mir_id', 'symbol', 'embl', 'gene_name', 'mbv'])
    .rename(columns={'mir_acc': 'miRNA', 'entrez': 'Pseudogene'})
)
miRNA_pseudogene['Source(s)'] = 'miRNet'
miRNA_pseudogene

In [None]:
# Does miRNet contain premiRNA-pseudogene interactions?
# Reduce the boolean mask itself so the cell shows a single True/False answer;
# DataFrame.any() on the filtered rows reports per-column truthiness instead.
(~miRNA_pseudogene['miRNA'].str.startswith('MIMAT')).any()

In [None]:
# Write the de-duplicated miRNA-pseudogene edges as a tab-separated file.
_pseudogene_edges = miRNA_pseudogene.drop_duplicates()
_pseudogene_edges.to_csv(edge_data_location + 'RmiRNA-pseudogene.txt', sep='\t', index=None)

***
### miRNA-epigenetic modification - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [EpimiR](http://www.jianglab.cn/EpimiR/index.jsp) <br />
The EpimiR database collects 1974 regulatory relationships between 19 types of epigenetic modifications (including DNA methylation, histone acetylation, H3K4me3 and H3K27me3, etc.) and 617 miRNAs across 7 species (including Homo sapiens), curated from nearly 2000 publications.

In [None]:
#via miRNet --> https://www.dropbox.com/s/p852ndpck5jasxz/miRNet-mir-epi-hsa.csv?dl=0
# One row per (miRNA, epigenetic modification): '/'-separated modifications are split
# into a list and exploded into separate rows.
miRNA_epiMod = (
    pd.read_csv(unprocessed_data_location + 'miRNet-mir-epi-hsa.csv')
    .drop(columns=['mirnet', 'mir_id', 'note', 'res_type', 'year'])
    .assign(epi_modification=lambda frame: frame['epi_modification'].str.split('/'))
    .explode('epi_modification')
)
# Place the modification in the second column.
_modification = miRNA_epiMod.pop('epi_modification')
miRNA_epiMod.insert(1, 'epi_modification', _modification)
miRNA_epiMod['Source(s)'] = 'EpimiR|miRNet'
miRNA_epiMod

In [None]:
# Replace the three modification labels below by GO identifiers.
_go_by_label = {
    'DNA Methylation': 'GO_0006306',
    'Histone Acetylation': 'GO_0016573',
    'Histone Modification': 'GO_0016570',
}
for _label, _go_id in _go_by_label.items():
    miRNA_epiMod['epi_modification'] = miRNA_epiMod['epi_modification'].str.replace(_label, _go_id)
miRNA_epiMod['epi_modification'].unique()

Manually fix inconsistencies.

- Experiment.

In [None]:
# Display the distinct values of the 'experiment' column.
miRNA_epiMod.experiment.unique()

In [None]:
# Normalise the experiment annotation to '|'-separated NCIT terms.
# regex=False on every call: the patterns are literal text. As regular expressions "." is
# a wildcard and "in vitro|in vivo" is an alternation that would rewrite each single term
# into 'in vivo|in vitro'.
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace("in vivo ", 'in vivo', regex=False)
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace("/vivo", '|in vivo', regex=False)
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace("/", '|', regex=False)
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace(".", '|', regex=False)
# Use one canonical order when both settings are reported.
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace(
    "in vitro|in vivo", 'in vivo|in vitro', regex=False)
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace(
    "in vitro",'http://purl.obolibrary.org/obo/NCIT_C15263 (in vitro)', regex=False)
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace(
    "in vivo", 'http://purl.obolibrary.org/obo/NCIT_C15744 (in vivo)', regex=False)
miRNA_epiMod.experiment.unique()

- Epigenetic regulator.

In [None]:
# Display the distinct values of the 'epi_regulator' column.
miRNA_epiMod.epi_regulator.unique()

In [None]:
# Use '|' instead of '/' as the separator between regulators.
_regulators = miRNA_epiMod['epi_regulator']
miRNA_epiMod['epi_regulator'] = _regulators.str.replace("/", '|')
miRNA_epiMod['epi_regulator'].unique()

- Expression

In [None]:
# Display the distinct values of the 'expression' column.
miRNA_epiMod.expression.unique()

In [None]:
# Replace the 'low' / 'high' labels by the NCIT terms below (applied in this order).
_expression_terms = (
    ("low", 'http://purl.obolibrary.org/obo/NCIT_C177694 (decreased expression)'),
    ("high", 'http://purl.obolibrary.org/obo/NCIT_C177693 (elevated expression)'),
)
for _level, _ncit_term in _expression_terms:
    miRNA_epiMod.expression = miRNA_epiMod.expression.str.replace(_level, _ncit_term)
miRNA_epiMod.expression.unique()

- Epi target.

In [None]:
# Display the first five distinct values of the 'epi_target' column.
miRNA_epiMod.epi_target.unique()[:5]

In [None]:
# Map the null marker to NaN BEFORE lower-casing: once the column is lower-cased the
# marker's upper-case 'N' no longer exists and this exact-match replace could never hit.
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.replace("\\\\\\\\N", np.nan)
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.lower()
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.replace("/", '|', regex=False)
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.replace(" cluster", '', regex=False)
# Remove parenthesised qualifiers together with the single space in front of them.
# Previously the bare '(...)' pattern ran first, so the ' (...)' pattern never matched
# and a trailing space was left behind, defeating the dictionary lookup below.
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.replace(r' ?\(.*?\)', '', regex=True)

# Lookup: label (column 0 of desc_pro_map) -> "<OBO IRI built from column 1> (<label>)".
pro_dict = dict(zip(desc_pro_map[0], 'http://purl.obolibrary.org/obo/' + desc_pro_map[1] +
                   ' (' + desc_pro_map[0] + ')'))

def replace_with_pro(substring):
    """Translate each '|'-separated term of *substring* through ``pro_dict``.

    Terms without an entry are kept unchanged; a missing value yields NaN.
    """
    if pd.isna(substring):
        return np.nan
    return '|'.join([pro_dict.get(part, part) for part in substring.split('|')])

miRNA_epiMod.epi_target = [replace_with_pro(item) for item in miRNA_epiMod.epi_target]
miRNA_epiMod.epi_target.unique()[:5]

- Detection method.

In [None]:
# Display the distinct values of the 'detect' column.
miRNA_epiMod.detect.unique()

In [None]:
# Rewrite the composite label first. regex=False is required: as a regular expression the
# parentheses form a group and '+' is a quantifier, so the literal text
# '(promoter+inhibitor)microarray' would never be matched.
miRNA_epiMod.detect = miRNA_epiMod.detect.str.replace('(promoter+inhibitor)microarray',
                                                      'promoter+inhibitor+microarray',
                                                      regex=False)

# Replace each detection keyword by its NCIT term, in this order (literal matching).
_detect_terms = (
    ('microarray', 'http://purl.obolibrary.org/obo/NCIT_C44282 (microarray)'),
    ('promoter', 'http://purl.obolibrary.org/obo/NCIT_C13297 (promoter)'),
    ('inhibitor', 'http://purl.obolibrary.org/obo/NCIT_C154898 (inhibitor)'),
    ('expression', 'http://purl.obolibrary.org/obo/NCIT_C80488 (expression)'),
    ('target', 'http://purl.obolibrary.org/obo/NCIT_C25702 (target)'),
    ('transfection', 'http://purl.obolibrary.org/obo/NCIT_C17209 (transfection)'),
    ('methylation', 'http://purl.obolibrary.org/obo/NCIT_C16848 (methylation)'),
)
for _keyword, _ncit_term in _detect_terms:
    miRNA_epiMod.detect = miRNA_epiMod.detect.str.replace(_keyword, _ncit_term, regex=False)
miRNA_epiMod.detect.unique()[:5]

- Condition.

In [None]:
# Display the first five distinct values of the 'condition' column.
miRNA_epiMod.condition.unique()[:5]

In [None]:
# Rewrite one label, then translate every '|'-separated term with replace_with_mondo.
_conditions = miRNA_epiMod['condition'].str.replace(
    'non-small-cell lung cancer', 'non-small cell lung carcinoma')
miRNA_epiMod['condition'] = list(map(replace_with_mondo, _conditions))
miRNA_epiMod.condition.unique()[:5]

- References

In [None]:

# Turn PubMed identifiers into URLs; a stringified NaN ('.../nan') goes back to NaN.
_pubmed_base = 'https://pubmed.ncbi.nlm.nih.gov/'
_pmid_links = _pubmed_base + miRNA_epiMod['pmid'].astype(str)
miRNA_epiMod['pmid'] = _pmid_links.replace(_pubmed_base + 'nan', np.nan)

In [None]:
# Final, human-readable column names for the miRNA-epigenetic modification table.
_epimod_headers = {
    'mir_acc': 'miRNA',
    'epi_modification': 'Epigenetic modification',
    'epi_regulator': 'Epigenetic regulator',
    'experiment': 'Experiment',
    'expression': 'Expression',
    'pmid': 'References (PMID)',
    'epi_target': 'Epigenetic target',
    'condition': 'Condition',
    'detect': 'Detect',
    'support': 'Support',
}
miRNA_epiMod = miRNA_epiMod.rename(columns=_epimod_headers)

Let's divide miRNA-epiMod interactions into mature_miRNA-epiMod interactions and stem-loop_miRNA-epiMod interactions. Furthermore, let's divide GO classes from entities.

In [None]:
# 'MIMAT...' accessions go to the mature table; other 'MI...' accessions to the pre-miRNA one.
_accessions = miRNA_epiMod['miRNA']
_is_mature = _accessions.str.startswith('MIMAT')
_is_mirbase = _accessions.str.startswith('MI')
maturemiRNA_epiMod = miRNA_epiMod[_is_mature]
premiRNA_epiMod = miRNA_epiMod[_is_mirbase & (~_is_mature)]

In [None]:
# Rows whose modification starts with 'GO' are written to the "class" edge files,
# de-duplicated and without a header row.
_mature_go = maturemiRNA_epiMod['Epigenetic modification'].str.startswith('GO')
_pre_go = premiRNA_epiMod['Epigenetic modification'].str.startswith('GO')
maturemiRNA_epiMod_class = maturemiRNA_epiMod[_mature_go]
premiRNA_epiMod_class = premiRNA_epiMod[_pre_go]

for _class_edges, _file_name in ((maturemiRNA_epiMod_class, 'RmiRNA-epiModclass.txt'),
                                 (premiRNA_epiMod_class, 'RpremiRNA-epiModclass.txt')):
    _class_edges.drop_duplicates().to_csv(
        edge_data_location + _file_name, header=None, sep='\t', index=None)

In [None]:
# Keep the rows whose modification does NOT start with 'GO' and write them,
# de-duplicated and with a header row.
_mature_is_go = maturemiRNA_epiMod['Epigenetic modification'].str.startswith('GO')
_pre_is_go = premiRNA_epiMod['Epigenetic modification'].str.startswith('GO')
maturemiRNA_epiMod = maturemiRNA_epiMod[~_mature_is_go]
premiRNA_epiMod = premiRNA_epiMod[~_pre_is_go]

for _entity_edges, _file_name in ((maturemiRNA_epiMod, 'RmiRNA-epiMod.txt'),
                                  (premiRNA_epiMod, 'RpremiRNA-epiMod.txt')):
    _entity_edges.drop_duplicates().to_csv(
        edge_data_location + _file_name, sep='\t', index=None)

***
### miRNA-disease - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition)

* [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/) <br />miR2Disease is a manually curated database that aims at providing a comprehensive resource of miRNA deregulation in various human diseases.

In [None]:
# Download the miR2Disease export via the data_downloader helper, passing
# unprocessed_data_location as the target (the next cell reads 'AllEntries.txt' from there).
data_downloader('http://watson.compbio.iupui.edu:8080/miR2Disease/download/AllEntries.txt', unprocessed_data_location)

In [None]:
# The file has no header row: column 0 becomes 'mir_id', column 1 the lower-cased 'disease';
# the remaining columns keep their integer labels.
_mir2disease_path = unprocessed_data_location + 'AllEntries.txt'
miRNA_disease = pd.read_csv(_mir2disease_path, sep="\t", header=None)
miRNA_disease = miRNA_disease.rename(columns={0: 'mir_id', 1: 'disease'})
miRNA_disease['disease'] = miRNA_disease['disease'].str.lower()
miRNA_disease['Source(s)'] = 'miR2Disease'
miRNA_disease

***
* [HMDD](https://www.cuilab.cn/hmdd) <br /> HMDD (the Human microRNA Disease Database) is a database of curated, experiment-supported evidence for human microRNA (miRNA) and disease associations. miRNAs are one class of important regulatory RNAs, which mainly repress gene expression at the post-transcriptional level.

In [None]:
# Shell escape: download the HMDD 'alldata.xlsx' spreadsheet with wget.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# the file from unprocessed_data_location — confirm both point to the same folder.
!wget https://www.cuilab.cn/static/hmdd3/data/alldata.xlsx

In [None]:
# Load HMDD and align its column names with the accumulated miRNA-disease table.
miRNA_disease2 = pd.read_excel(unprocessed_data_location+'alldata.xlsx')
miRNA_disease2.rename(columns={'mir': 'mir_id'}, inplace=True)
# Lower-case HMDD's OWN disease column. Reading it from miRNA_disease (the miR2Disease
# table) would overwrite HMDD's diseases with index-aligned values from another source.
miRNA_disease2.disease = miRNA_disease2.disease.str.lower()
miRNA_disease2['Source(s)'] = 'HMDD'
miRNA_disease2

In [None]:
# Outer-join HMDD onto the accumulated miRNA-disease table.
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Filled columns are assigned back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and does not update the parent frame when
# pandas Copy-on-Write is active.
miRNA_disease['description'] = miRNA_disease['description'].fillna(miRNA_disease[5])
miRNA_disease.loc[(miRNA_disease['description'].notna()) & (miRNA_disease[5].notna()) &
                  (miRNA_disease['description'] != miRNA_disease[5]),
          ["description"]] = miRNA_disease["description"] + '|' + miRNA_disease[5]

miRNA_disease['category'] = miRNA_disease['category'].fillna(miRNA_disease[2])
miRNA_disease.loc[(miRNA_disease['category'].notna()) & (miRNA_disease[2].notna()) &
                  (miRNA_disease['category'] != miRNA_disease[2]),
          ["category"]] = miRNA_disease["category"] + '|' + miRNA_disease[2]

# Concatenate the provenance of both sides of the join into a single 'Source(s)' column.
miRNA_disease['Source(s)_x'] = miRNA_disease['Source(s)_x'].astype(str)
miRNA_disease['Source(s)_y'] = miRNA_disease['Source(s)_y'].astype(str)
miRNA_disease['Source(s)'] = miRNA_disease['Source(s)_x'] + '|' + miRNA_disease['Source(s)_y']
miRNA_disease = miRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

# NOTE(review): the two merged columns are dropped right away, so the merging above has
# no lasting effect — confirm this is intended.
miRNA_disease.drop(columns=['category','description'],inplace=True)
miRNA_disease

***
* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Shell escape: download the miRNet miRNA-disease table from Dropbox with wget.
# The saved file keeps the '?dl=0' suffix, which is why the next cell opens
# "miRNet-mir-disease.csv?dl=0".
!wget https://www.dropbox.com/s/o27wz2kg9co76mo/miRNet-mir-disease.csv?dl=0

In [None]:
# Load miRNet's disease associations, lower-casing the disease names.
_mirnet_disease = pd.read_csv(unprocessed_data_location + "miRNet-mir-disease.csv?dl=0")
_mirnet_disease.disease = _mirnet_disease.disease.str.lower()

# Leave out the rows miRNet attributes to miR2Disease, then drop the bookkeeping columns.
_from_mir2disease = _mirnet_disease['database'].str.contains("miR2Disease")
miRNA_disease2 = _mirnet_disease[~_from_mir2disease].drop(columns=['database', 'mir_acc', 'mirnet'])
miRNA_disease2['Source(s)'] = 'miRNet'
miRNA_disease2

In [None]:
# Outer-join miRNet onto the accumulated miRNA-disease table.
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Filled columns are assigned back rather than using fillna(inplace=True) on a column
# selection, which is chained assignment and does not update the parent frame when
# pandas Copy-on-Write is active.
# 'evidence' absorbs column 2 and 'method' absorbs column 3 of the headerless table.
miRNA_disease['evidence'] = miRNA_disease['evidence'].fillna(miRNA_disease[2])
miRNA_disease.loc[(miRNA_disease['evidence'].notna()) & (miRNA_disease[2].notna()) &
                  (miRNA_disease['evidence'] != miRNA_disease[2]),
          ["evidence"]] = miRNA_disease["evidence"] + '|' + miRNA_disease[2]

miRNA_disease['method'] = miRNA_disease['method'].fillna(miRNA_disease[3])
miRNA_disease.loc[(miRNA_disease['method'].notna()) & (miRNA_disease[3].notna()) &
                  (miRNA_disease['method'] != miRNA_disease[3]),
          ["method"]] = miRNA_disease["method"] + '|' + miRNA_disease[3]

# Concatenate the provenance of both sides of the join into a single 'Source(s)' column.
miRNA_disease['Source(s)_x'] = miRNA_disease['Source(s)_x'].astype(str)
miRNA_disease['Source(s)_y'] = miRNA_disease['Source(s)_y'].astype(str)
miRNA_disease['Source(s)'] = miRNA_disease['Source(s)_x'] + '|' + miRNA_disease['Source(s)_y']
miRNA_disease = miRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

# Combine the references of both sources in 'pmid_x' ('pmid_y' is kept for a later cell).
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].fillna(miRNA_disease['pmid_y'])
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].astype(str)
miRNA_disease['pmid_y'] = miRNA_disease['pmid_y'].astype(str)
miRNA_disease.loc[(miRNA_disease['pmid_x']!="<NA>") & (miRNA_disease['pmid_y']!="<NA>") &
                  (miRNA_disease['pmid_x'] != miRNA_disease['pmid_y']),
                  ["pmid_x"]] = miRNA_disease["pmid_x"] + '|' + miRNA_disease['pmid_y']

miRNA_disease.drop(columns=[2,3],inplace=True)
miRNA_disease

***
* [dbDEMC](https://www.biosino.org/dbDEMC/index) <br /> dbDEMC (database of Differentially Expressed MiRNAs in human Cancers) is an integrated database designed to store and display differentially expressed microRNAs (miRNAs) in cancers.

In [None]:
# Shell escape: download the dbDEMC 'MiRExpAll' expression table with wget.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# the file from unprocessed_data_location — confirm both point to the same folder.
! wget https://www.biosino.org/dbDEMC/download/MiRExpAll

In [None]:
# Load dbDEMC and keep the rows whose species contains "apiens".
_dbdemc = pd.read_csv(unprocessed_data_location + "MiRExpAll", sep="\t")
_dbdemc = _dbdemc[_dbdemc.Species.str.contains("apiens")]

# Drop the columns that are not carried forward, then keep adjusted p-values below 0.01.
_dbdemc_unused = ['miRNA_ID', 'ExperimentID', 'logFC', 'SourceDataID',
                  'AveExpr', 'Tvalue', 'Pvalue', 'Bvalue', 'Species']
_dbdemc = _dbdemc.drop(columns=_dbdemc_unused)
_dbdemc = _dbdemc[_dbdemc['adjPvalue'] < .01]

miRNA_disease2 = _dbdemc.rename(columns={'miRBaseID': 'mir_id', 'CancerType': 'disease'})
miRNA_disease2['Source(s)'] = 'dbDEMC'
miRNA_disease2

In [None]:
# Outer-join dbDEMC onto the accumulated miRNA-disease table.
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Fold dbDEMC's 'Status' into 'evidence'. The filled column is assigned back rather than
# using fillna(inplace=True) on a column selection, which is chained assignment and does
# not update the parent frame when pandas Copy-on-Write is active.
miRNA_disease['evidence'] = miRNA_disease['evidence'].fillna(miRNA_disease['Status'])
miRNA_disease.loc[(miRNA_disease['evidence'].notna()) & (miRNA_disease['Status'].notna()) &
                  (miRNA_disease['evidence'] != miRNA_disease['Status']),
          ["evidence"]] = miRNA_disease["evidence"] + '|' + miRNA_disease['Status']

miRNA_disease.drop(columns=['Status'],inplace=True)

# Concatenate the provenance of both sides of the join into a single 'Source(s)' column.
miRNA_disease['Source(s)_x'] = miRNA_disease['Source(s)_x'].astype(str)
miRNA_disease['Source(s)_y'] = miRNA_disease['Source(s)_y'].astype(str)
miRNA_disease['Source(s)'] = miRNA_disease['Source(s)_x'] + '|' + miRNA_disease['Source(s)_y']
miRNA_disease = miRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

miRNA_disease

***
* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php) <br /> miRdSNP is a database of disease-associated SNPs and microRNA target sites on 3'UTRs of human genes.

In [None]:
# Shell escape: download the miRdSNP v11.03 miRNA-target CSV with wget (the same URL as
# in the miRNA-mRNA section).
# NOTE(review): wget saves into the current working directory, while the next cell reads
# the file from unprocessed_data_location — confirm both point to the same folder.
! wget http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv

In [None]:
# Load miRdSNP, lower-case its disease annotation and align the join-key column names.
_mirdsnp_path = unprocessed_data_location + 'mirdsnp-dsnp-generated-mir-targets-v11.03.csv'
miRdSNP = pd.read_csv(_mirdsnp_path)
miRdSNP['diseases'] = miRdSNP['diseases'].str.lower()
miRdSNP = miRdSNP.rename(columns={'miR': 'mir_id', 'diseases': 'disease'})
miRdSNP['Source(s)'] = 'miRdSNP'
miRdSNP

In [None]:
# Outer-join miRdSNP onto the accumulated miRNA-disease table.
miRNA_disease = pd.merge(miRNA_disease,miRdSNP,how='outer',on=['mir_id','disease'])

# Fold 'experimentally_confirmed' into 'method'. The filled column is assigned back rather
# than using fillna(inplace=True) on a column selection, which is chained assignment and
# does not update the parent frame when pandas Copy-on-Write is active.
miRNA_disease['method'] = miRNA_disease['method'].fillna(miRNA_disease['experimentally_confirmed'])
miRNA_disease.loc[(miRNA_disease['method'].notna()) & (miRNA_disease['experimentally_confirmed'].notna()) &
                  (miRNA_disease['method'] != miRNA_disease['experimentally_confirmed']),
          ["method"]] = miRNA_disease["method"] + '|' + miRNA_disease['experimentally_confirmed']

# Concatenate the provenance of both sides of the join into a single 'Source(s)' column.
miRNA_disease['Source(s)_x'] = miRNA_disease['Source(s)_x'].astype(str)
miRNA_disease['Source(s)_y'] = miRNA_disease['Source(s)_y'].astype(str)
miRNA_disease['Source(s)'] = miRNA_disease['Source(s)_x'] + '|' + miRNA_disease['Source(s)_y']
miRNA_disease = miRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

miRNA_disease.drop(columns=['experimentally_confirmed'],inplace=True)
miRNA_disease

***
* [TAM](http://www.lirmed.com/tam2/) <br /> TAM groups miRNAs into six categories of miRNA sets: miRNA-family sets, miRNA cluster sets, miRNA-disease, miRNA-function sets, miRNA-TF sets and tissue specificity sets.

In [None]:
# Shell escape: download the TAM 'mirset_v9.txt' miRNA-set file with wget.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# the file from unprocessed_data_location — confirm both point to the same folder.
! wget http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt

In [None]:
# Rows have a variable number of fields; names=range(500) gives every row room for up to
# 500 columns, and the columns that end up entirely empty are dropped again.
TAM = pd.read_csv(unprocessed_data_location+'mirset_v9.txt', sep='\t',names=range(500))
TAM = TAM.dropna(axis=1, how='all')
# Keep the rows whose first field is "HMDD". Take an explicit copy so the column
# assignment below writes to an independent frame instead of a slice of TAM
# (avoids pandas' SettingWithCopyWarning).
miRNA_disease2 = TAM[TAM[0]==("HMDD")].copy()
miRNA_disease2[1] = miRNA_disease2[1].str.lower()
miRNA_disease2 = miRNA_disease2.dropna(axis=1, how='all')
miRNA_disease2 = miRNA_disease2.drop(columns=[0])
miRNA_disease2

In [None]:
def _join_row_members(row):
    """Join the non-missing cells of *row* into one comma-separated string."""
    return ','.join(row.dropna().astype(str))


# Every column after the first one holds a set member; gather them per row.
_member_columns = miRNA_disease2.columns[1:]
miRNA_disease2['merged'] = miRNA_disease2[_member_columns].apply(_join_row_members, axis=1)
miRNA_disease2 = miRNA_disease2[[1, 'merged']]

# One row per (disease, miRNA) pair.
miRNA_disease2['merged'] = miRNA_disease2['merged'].str.split(',')
miRNA_disease2 = (
    miRNA_disease2.explode('merged')
    .rename(columns={1: 'disease', 'merged': 'mir_id'})
)

miRNA_disease2['Source(s)'] = 'TAM'

miRNA_disease2

In [None]:
# Outer-join TAM onto the accumulated miRNA-disease table.
miRNA_disease = miRNA_disease.merge(miRNA_disease2, how='outer', on=['mir_id', 'disease'])

# Concatenate the provenance of both sides of the join into a single trailing 'Source(s)' column.
_source_left = miRNA_disease.pop('Source(s)_x').astype(str)
_source_right = miRNA_disease.pop('Source(s)_y').astype(str)
miRNA_disease['Source(s)'] = _source_left + '|' + _source_right

miRNA_disease

***
* [miRcancer](http://mircancer.ecu.edu/) <br /> miRCancer provides a comprehensive collection of miRNA expression profiles in various human cancers, automatically extracted from the published literature in PubMed.

In [None]:
# Shell escape: download the miRCancer June 2020 release with wget.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# the file from unprocessed_data_location — confirm both point to the same folder.
! wget http://mircancer.ecu.edu/downloads/miRCancerJune2020.txt

In [None]:
# Load miRCancer and collect the distinct article titles that need a PubMed identifier.
miRNA_disease2 = pd.read_csv(unprocessed_data_location + 'miRCancerJune2020.txt',sep='\t', encoding='latin1')
miRNA_disease2['Source(s)'] = 'miRCancer'
miRNA_disease2_ref = pd.DataFrame(miRNA_disease2['PubMed Article'].unique(), columns=["PubMed Article"])

Entrez.email = 'emanuelecavalleri@email.com'
def convert_to_pmid(article_title):
    """Return the first PubMed id found by an Entrez search for *article_title*.

    Returns None when the search result's "IdList" is empty.
    """
    handle = Entrez.esearch(db="pubmed", term=article_title)
    try:
        record = Entrez.read(handle)
    finally:
        # Release the handle even when parsing the response fails.
        handle.close()
    id_list = record["IdList"]
    return id_list[0] if id_list else None

# NCBI works with some limits
# Resolve the titles chunk by chunk. Each chunk receives its OWN 'PMID' column; writing
# the results of chunks 5 and 6 into chunk 4 overwrote chunk 4's identifiers with
# index-misaligned values and left chunks 5 and 6 without any identifier.
_resolved_chunks = []
for _chunk in np.array_split(miRNA_disease2_ref, 7):
    _chunk = _chunk.copy()
    _chunk["PMID"] = _chunk["PubMed Article"].apply(convert_to_pmid)
    _resolved_chunks.append(_chunk)
miRNA_disease2_ref = pd.concat(_resolved_chunks)
miRNA_disease2_ref.to_csv(unprocessed_data_location + 'miRCancerJune2020_ref.txt', sep='\t', index=None)

# Attach the resolved identifiers and drop the free-text title.
miRNA_disease2 = pd.merge(miRNA_disease2, miRNA_disease2_ref, on='PubMed Article', how='left')

miRNA_disease2.drop(columns=['PubMed Article'],inplace=True)
miRNA_disease2

In [None]:
# Align the join-key column names and outer-join miRCancer onto the accumulated table.
miRNA_disease2.rename(columns={'mirId':'mir_id', 'Cancer':'disease'}, inplace=True)
miRNA_disease = pd.merge(miRNA_disease,miRNA_disease2,how='outer',on=['mir_id','disease'])

# Fold miRCancer's 'PMID' into 'pmid_x'. The filled column is assigned back rather than
# using fillna(inplace=True) on a column selection, which is chained assignment and does
# not update the parent frame when pandas Copy-on-Write is active.
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].fillna(miRNA_disease['PMID'])
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].astype(str)
miRNA_disease['PMID'] = miRNA_disease['PMID'].astype(str)
# Unresolved titles were stored as None, i.e. the string 'None' after the cast above.
miRNA_disease['PMID'] = miRNA_disease['PMID'].replace('None', np.nan)
miRNA_disease.loc[(miRNA_disease['pmid_y']!="<NA>") & (miRNA_disease['pmid_x']!="<NA>") & (~miRNA_disease['PMID'].isna()) &
                  (miRNA_disease['pmid_x'] != miRNA_disease['PMID']) &
                  (miRNA_disease['pmid_y'] != miRNA_disease['PMID']),
                  ["pmid_x"]] = miRNA_disease["pmid_x"] + '|' + miRNA_disease['PMID']
miRNA_disease.drop(columns=['PMID'],inplace=True)
miRNA_disease.drop(columns=['pmid_y'],inplace=True)

miRNA_disease

In [None]:
# Route 1: disease name -> 'doid' (desc_do_map) -> column 1 of doid_mondo_map.
_do_lookup = desc_do_map.rename(columns={'desc': 'disease'})
miRNA_disease1 = miRNA_disease.merge(_do_lookup, on=['disease'])
_doid_column = miRNA_disease1.pop("doid")
miRNA_disease1.insert(1, 'doid', _doid_column)

_doid_lookup = doid_mondo_map.rename(columns={0: 'doid'})
miRNA_disease1 = miRNA_disease1.merge(_doid_lookup, on='doid').drop(columns=['doid'])
_mapped_column = miRNA_disease1.pop(1)
miRNA_disease1.insert(1, 1, _mapped_column)

# Route 2: disease name matched directly against column 0 of desc_disPhe_map.
_disphe_lookup = desc_disPhe_map.rename(columns={0: 'disease'})
miRNA_disease2 = miRNA_disease.merge(_disphe_lookup, on=['disease'])

# Union of both routes, then replace the miRNA name by the entry found in mirna_mirbase_map.
miRNA_disease = pd.concat([miRNA_disease1, miRNA_disease2]).drop_duplicates()
_mirbase_by_id = mirna_mirbase_map.rename(columns={2: 'mir_id'})
miRNA_disease = _mirbase_by_id.merge(miRNA_disease, on='mir_id').drop(columns=['mir_id', 'disease'])

# Concatenate the two provenance columns into a single trailing 'Source(s)' column.
_source_left = miRNA_disease.pop('Source(s)_x').astype(str)
_source_right = miRNA_disease.pop('Source(s)_y').astype(str)
miRNA_disease['Source(s)'] = _source_left + '|' + _source_right

miRNA_disease

Manually fix some inconsistencies.

In [None]:
# Turn SNP identifiers into URLs; a bare prefix (empty identifier) becomes NaN.
miRNA_disease['SNP'] = "https://www.ncbi.nlm.nih.gov/snp/" + miRNA_disease['SNP']
miRNA_disease['SNP'] = miRNA_disease['SNP'].replace('https://www.ncbi.nlm.nih.gov/snp/', np.nan)

# Remove stringified NaNs that were concatenated next to a real reference.
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].str.replace(r'nan\||\|nan', '', regex=True)
# Drop only a trailing float artefact ('123.0' -> '123'). The unescaped pattern '.0'
# matches ANY character followed by '0' and silently deletes digits inside identifiers.
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].str.replace(r'\.0(?=\||$)', '', regex=True)
# '|' is a literal separator here, not a regex alternation.
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].str.replace(
    '|', '|https://pubmed.ncbi.nlm.nih.gov/', regex=False)
miRNA_disease['pmid_x'] = 'https://pubmed.ncbi.nlm.nih.gov/' + miRNA_disease['pmid_x']
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].replace('https://pubmed.ncbi.nlm.nih.gov/nan', np.nan)
miRNA_disease['pmid_x'] = miRNA_disease['pmid_x'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)

miRNA_disease['Source(s)'] = miRNA_disease['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

- Evidence.

In [None]:
# Column 4 of the headerless table is not carried forward.
miRNA_disease = miRNA_disease.drop(columns=[4])

# Lower-case the evidence, use '|' as list separator and spaces instead of underscores.
_evidence = miRNA_disease['evidence'].str.lower()
for _old, _new in ((', ', '|'), ('_', ' ')):
    _evidence = _evidence.str.replace(_old, _new)
miRNA_disease['evidence'] = _evidence
miRNA_disease.evidence.unique()[:5]

- Experimental design.

In [None]:
# Display the distinct values of the 'ExperimentalDesign' column.
miRNA_disease.ExperimentalDesign.unique()

In [None]:
# Replace the experimental-design keywords by ontology terms, in this order.
_design_terms = (
    ('blood', 'http://purl.obolibrary.org/obo/UBERON_0000178 (blood)'),
    ('body fluid', 'http://purl.obolibrary.org/obo/UBERON_0006314 (body fluid)'),
    ('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)'),
    ('exosomes', 'http://purl.obolibrary.org/obo/NCIT_C104457 (exosome)'),
)
for _keyword, _term in _design_terms:
    miRNA_disease.ExperimentalDesign = miRNA_disease.ExperimentalDesign.str.replace(_keyword, _term)
miRNA_disease.ExperimentalDesign.unique()

- Gene.

In [None]:
# Display the distinct values of the 'gene_name' column.
miRNA_disease.gene_name.unique()

In [None]:
# Look up the identifier of each gene symbol; rows without a match are kept (left join).
miRNA_disease = pd.merge(miRNA_disease, symbol_entrez_map.rename(columns={'0_x':'gene_name'}), on='gene_name', how='left')
miRNA_disease['0_y'] = miRNA_disease['0_y'].astype('Int64').astype('str')
# The mapped value is a gene identifier, so it is linked to the NCBI Gene page; the PubMed
# prefix used before produced links to unrelated articles.
# NOTE(review): assumes symbol_entrez_map yields NCBI Entrez Gene IDs — confirm.
miRNA_disease['0_y'] = "https://www.ncbi.nlm.nih.gov/gene/" + miRNA_disease['0_y']
# Assign the result back instead of replace(inplace=True) on a column selection (chained
# assignment, ineffective under pandas Copy-on-Write).
miRNA_disease['0_y'] = miRNA_disease['0_y'].replace('https://www.ncbi.nlm.nih.gov/gene/<NA>', np.nan)
miRNA_disease.drop(columns=['gene_name'],inplace=True)
miRNA_disease

- Method.

In [None]:
# Display the first five distinct values of the 'method' column.
miRNA_disease['method'].unique()[:5]

In [None]:
# Normalise the method annotation to a lower-case, '|'-separated list.
# regex=False: the patterns are literal text. As a regular expression ' + ' means "one or
# more spaces followed by a space" and would never match the literal ' + ' separator.
miRNA_disease.method = miRNA_disease.method.str.lower()
miRNA_disease.method = miRNA_disease.method.str.replace(' etc', '', regex=False)
miRNA_disease.method = miRNA_disease.method.str.replace(', ', '|', regex=False)
miRNA_disease.method = miRNA_disease.method.str.replace(' + ', '|', regex=False)
# Translate each entry with the replace_with_ncit helper.
miRNA_disease.method = [replace_with_ncit(item) for item in miRNA_disease.method]
miRNA_disease.method.unique()[:5]

- Cancer subtype.

In [None]:
# Lower-case the cancer subtype and translate every '|'-separated term with replace_with_mondo.
_subtypes = miRNA_disease['CancerSubtype'].str.lower()
miRNA_disease['CancerSubtype'] = list(map(replace_with_mondo, _subtypes))
miRNA_disease.CancerSubtype.unique()

- Cell line.

In [None]:
clo_dict = dict(zip(desc_clo_map[0], 'http://purl.obolibrary.org/obo/' + desc_clo_map[1] +
                    ' (' + desc_clo_map[0] + ')'))

def replace_with_clo(substring):
    """Translate a '|'-separated string through ``clo_dict``.

    Each '|'-delimited part that is a key of ``clo_dict`` is swapped for its
    mapped value; unknown parts are kept unchanged. Missing input (anything
    ``pd.isna`` accepts) yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    translated = (clo_dict.get(token, token) for token in substring.split('|'))
    return '|'.join(translated)

# Turn "a,b" into "a cell|b cell" so that every entry carries the ' cell' suffix
# before it is looked up through replace_with_clo.
cell_line_labels = miRNA_disease['Cellline'].str.lower().str.replace(',', ' cell|') + ' cell'
miRNA_disease['Cellline'] = [replace_with_clo(label) for label in cell_line_labels]
miRNA_disease['Cellline'].unique()

In [None]:
miRNA_disease[5].unique()

In [None]:
# Final, human-readable column names for the miRNA-disease table.
miRNA_disease_column_names = {
    0: 'miRNA',
    1: 'Disease',
    5: 'Description',
    'pmid_x': 'References (PMID)',
    'method': 'Method',
    'evidence': 'Evidence',
    'CancerSubtype': 'Cancer subtype',
    'Cellline': 'Cell line',
    'ExperimentalDesign': 'Experimental design',
    'adjPvalue': 'FDR',
    'refseq_id': 'RefSeq ID',
    'distance': 'Distance',
    '0_y': 'Gene',
}
miRNA_disease.rename(columns=miRNA_disease_column_names, inplace=True)

Let's divide miRNA-disease interactions into mature_miRNA-disease interactions and stem-loop_miRNA-disease interactions.

In [None]:
# Rows whose miRNA identifier starts with 'MIMAT' go to the mature table,
# all remaining rows to the stem-loop (pre-miRNA) table.
is_mature_mirna = miRNA_disease['miRNA'].str.startswith('MIMAT')
maturemiRNA_disease = miRNA_disease[is_mature_mirna]
premiRNA_disease = miRNA_disease[~is_mature_mirna]
for edge_subset, edge_file in ((maturemiRNA_disease, 'RmiRNA-disease.txt'),
                               (premiRNA_disease, 'RpremiRNA-disease.txt')):
    merge_rows(edge_subset, 'miRNA', 'Disease').to_csv(
        edge_data_location + edge_file, sep='\t', index=None)

***
### miRNA-lncRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [miRNet](https://www.mirnet.ca/)

In [None]:
# Load the miRNet miRNA-lncRNA table, discard the columns not used downstream
# and tag every row with its source.
mirnet_unused_columns = ['mirnet','mir_acc','entrez','embl','gene_name','mbv']
miRNA_lncRNA = pd.read_csv(unprocessed_data_location + "miRNet-mir-lncRNA.csv").drop(
    columns=mirnet_unused_columns)
miRNA_lncRNA['Source(s)'] = 'miRNet'
miRNA_lncRNA

In [None]:
# Are all miRNA molecules human ones?
# .all() answers the question asked (any() is already True with a single human
# entry); na=False counts a missing identifier as "not human".
miRNA_lncRNA['mir_id'].str.contains("hsa", na=False).all()

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/) <br />  LncRNAWiki is devoted to community curation of human long non-coding RNAs (lncRNAs) to provide a comprehensive and up-to-date resource of functionally annotated lncRNAs. It incorporates a comprehensive collection of experimentally studied lncRNAs and integrates a wealth of their annotations based on a standardized curation model, and improves curation quality through expert curator review and community error report. 

In [None]:
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Keep only the rows whose target type mentions miRNA; na=False discards rows
# without a target type. .copy() gives an independent frame, so the edits below
# do not operate on a slice of LncRNAWiki (avoids SettingWithCopyWarning).
targets_mirna = LncRNAWiki['target_type'].str.contains('miRNA', na=False)
miRNA_lncRNA2 = LncRNAWiki[targets_mirna].copy()
# Prefix the target name with 'hsa-' so it can be joined on 'mir_id' later on.
miRNA_lncRNA2['target'] = 'hsa-' + miRNA_lncRNA2['target']
miRNA_lncRNA2 = miRNA_lncRNA2.drop(
    columns=['gene_locus','synonyms','gene_id','transcript_id','target_interaction'])

miRNA_lncRNA2['Source(s)'] = 'LncRNAWiki'
miRNA_lncRNA2

In [None]:
# Outer-join the LncRNAWiki rows onto the current table on (mir_id, symbol) and
# collapse the two per-source columns into one '|'-separated 'Source(s)' column.
# A source that is absent for a row shows up as the text 'nan' at this stage.
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'target': 'mir_id'})
miRNA_lncRNA = miRNA_lncRNA.merge(miRNA_lncRNA2, how='outer', on=['mir_id','symbol'])

left_source = miRNA_lncRNA['Source(s)_x'].astype(str)
right_source = miRNA_lncRNA['Source(s)_y'].astype(str)
miRNA_lncRNA['Source(s)'] = left_source + '|' + right_source
miRNA_lncRNA = miRNA_lncRNA.drop(columns=['Source(s)_x', 'Source(s)_y'])

miRNA_lncRNA

***
* [SomamiR](https://compbio.uthsc.edu/SomamiR/)

In [None]:
! wget https://compbio.uthsc.edu/SomamiR/download/lncRNA_somatic_v2.0.txt.tar.gz

In [None]:
# Load the SomamiR lncRNA table (tab-separated), drop the unnamed trailing column,
# strip the 'lnc-' text from gene names and align the join-column names.
somamir_path = unprocessed_data_location+'lncRNA_somatic_v2.0.txt.tar.gz'
miRNA_lncRNA2 = pd.read_csv(somamir_path, sep='\t').drop(columns=['Unnamed: 18'])
miRNA_lncRNA2['Gene'] = miRNA_lncRNA2['Gene'].str.replace(r'lnc-', '')
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'Gene': 'symbol', 'miRNA':'mir_id'})
miRNA_lncRNA2['Source(s)'] = 'SomamiR'
miRNA_lncRNA2

In [None]:
miRNA_lncRNA = pd.merge(miRNA_lncRNA, miRNA_lncRNA2, how='outer', on=['mir_id','symbol'])

# Combine the two reference columns ('pmid' and 'PMID') into 'pmid'.
# Presence is recorded BEFORE casting to str: astype(str) turns a missing value
# into text ('nan' or '<NA>' depending on dtype), so string comparison against a
# single sentinel is unreliable. fillna is assigned back instead of being called
# with inplace=True on a column selection, which newer pandas no longer honours.
pmid_present = miRNA_lncRNA['pmid'].notna()
PMID_present = miRNA_lncRNA['PMID'].notna()
miRNA_lncRNA['pmid'] = miRNA_lncRNA['pmid'].fillna(miRNA_lncRNA['PMID']).astype(str)
miRNA_lncRNA['PMID'] = miRNA_lncRNA['PMID'].astype(str)
# Only when both references exist and differ are they joined with '|'.
both_and_different = pmid_present & PMID_present & (miRNA_lncRNA['pmid'] != miRNA_lncRNA['PMID'])
miRNA_lncRNA.loc[both_and_different, 'pmid'] = miRNA_lncRNA['pmid'] + '|' + miRNA_lncRNA['PMID']

miRNA_lncRNA.drop(columns=['PMID'],inplace=True)

# Inner joins: rows without a match in either identifier map are dropped.
miRNA_lncRNA = pd.merge(miRNA_lncRNA, mirna_mirbase_map.rename(columns={2:'mir_id'}), on='mir_id')
miRNA_lncRNA = pd.merge(miRNA_lncRNA, symbol_entrez_map[['0_x','0_y']].rename(columns={'0_x':'symbol'}), on='symbol')
miRNA_lncRNA.insert(1, '0_y', miRNA_lncRNA.pop("0_y"))

# Collapse the per-source columns; an absent source appears as the text 'nan' here.
miRNA_lncRNA['Source(s)_x'] = miRNA_lncRNA['Source(s)_x'].astype(str)
miRNA_lncRNA['Source(s)_y'] = miRNA_lncRNA['Source(s)_y'].astype(str)
miRNA_lncRNA['Source(s)'] = miRNA_lncRNA['Source(s)_x'] + '|' + miRNA_lncRNA['Source(s)_y']
miRNA_lncRNA = miRNA_lncRNA.drop(columns=['Source(s)_x', 'Source(s)_y', 'symbol', 'genome_variation',
                                          'variation_detail', 'epigenetic_modification', 'modification_detail',
                                          'molecular_function'])

miRNA_lncRNA

***
* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br />  LncBook accommodates a high-quality collection of 95,243 human lncRNA genes and 323,950 lncRNA transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts.

In [None]:
! wget https://ngdc.cncb.ac.cn/lncbook/files/lncrna_mirna_miRandaAndTargetScanAndRNAhybrid_LncBook2.0.csv.gz

In [None]:
# issue: 3h for complete download of data. BTW, miRanda and TargetScan have already been integrated in RNA-KG.

Manual fix of some inconsistencies.

In [None]:
# Snapshot of the merged table, so the manual fixes below can be re-run from a
# clean state. A dedicated name is used on purpose: at this point miRNA_lncRNA2
# holds the raw SomamiR table, so assigning it to miRNA_lncRNA (as this cell did
# before) would discard the merged data when the notebook runs top to bottom.
miRNA_lncRNA_checkpoint = miRNA_lncRNA.copy()
# To restore the snapshot: miRNA_lncRNA = miRNA_lncRNA_checkpoint.copy()

In [None]:
# Remove the 'nan' placeholders left behind when the per-source columns were joined.
miRNA_lncRNA['Source(s)'] = miRNA_lncRNA['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)
miRNA_lncRNA['pmid'] = miRNA_lncRNA['pmid'].str.replace(r'nan\||\|nan', '', regex=True)
# Strip the float suffix only where an identifier ends (before '|' or end of string).
# The unescaped pattern '.0' removed ANY character followed by '0', corrupting PMIDs
# that contain a zero.
miRNA_lncRNA['pmid'] = miRNA_lncRNA['pmid'].str.replace(r'\.0(?=\||$)', '', regex=True)
# Literal '|' -> prefix every identifier after the first with the PubMed URL.
miRNA_lncRNA['pmid'] = miRNA_lncRNA['pmid'].str.replace('|', '|https://pubmed.ncbi.nlm.nih.gov/', regex=False)
miRNA_lncRNA['pmid'] = 'https://pubmed.ncbi.nlm.nih.gov/' + miRNA_lncRNA['pmid']
miRNA_lncRNA['pmid'] = miRNA_lncRNA['pmid'].replace('https://pubmed.ncbi.nlm.nih.gov/nan', np.nan)

- Conservation ortholog.

In [None]:
# Show the raw labels, then swap the two handled labels for identifiers.
print(miRNA_lncRNA['conservation_ortholog'].unique())
ortholog_identifiers = (
    ('Human;7SL', 'https://www.ncbi.nlm.nih.gov/gene/6029'),
    ('Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)'),
)
for ortholog_label, ortholog_identifier in ortholog_identifiers:
    miRNA_lncRNA['conservation_ortholog'] = miRNA_lncRNA['conservation_ortholog'].str.replace(
        ortholog_label, ortholog_identifier)

- Biological context.

In [None]:
# Show the raw labels, then annotate 'Disease' with its MONDO root term.
print(miRNA_lncRNA['biological_context'].unique())
disease_term = 'http://purl.obolibrary.org/obo/MONDO_0000001 (disease)'
miRNA_lncRNA['biological_context'] = miRNA_lncRNA['biological_context'].str.replace('Disease', disease_term)

- Context detail.

In [None]:
# Pass every context detail through replace_with_mondo (defined elsewhere in the
# notebook) and preview the first distinct results.
context_details = miRNA_lncRNA['context_detail']
miRNA_lncRNA['context_detail'] = [replace_with_mondo(detail) for detail in context_details]
miRNA_lncRNA['context_detail'].unique()[:5]

- Expression.

In [None]:
# Show the raw labels, then annotate every occurrence of the text 'RNA' with its
# Sequence Ontology term (substring replacement, so it also applies inside longer words).
print(miRNA_lncRNA['expression'].unique())
rna_term = 'http://purl.obolibrary.org/obo/SO_0000356 (rna)'
miRNA_lncRNA['expression'] = miRNA_lncRNA['expression'].str.replace('RNA', rna_term)

- Regulator type.

In [None]:
# Show the raw labels, then replace the two handled regulator types, in this order.
print(miRNA_lncRNA['regulator_type'].unique())
regulator_type_terms = (
    ('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)'),
    ('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)'),
)
for type_label, type_term in regulator_type_terms:
    miRNA_lncRNA['regulator_type'] = miRNA_lncRNA['regulator_type'].str.replace(type_label, type_term)

- Regulator.

In [None]:
# Replace the regulator symbol by the identifier found in symbol_to_pro
# (left join: regulators without a match end up as NaN), then move the new
# 'Regulator' column to position 2.
print(miRNA_lncRNA['regulator'].unique())
regulator_lookup = symbol_to_pro[['0_x','1_y']].rename(columns={'0_x':'regulator'})
miRNA_lncRNA = miRNA_lncRNA.merge(regulator_lookup, on='regulator', how='left')
regulator_urls = "http://purl.obolibrary.org/obo/" + miRNA_lncRNA['1_y'].astype(str)
# astype(str) rendered missing identifiers as 'nan'; turn those back into NaN.
miRNA_lncRNA['1_y'] = regulator_urls.replace('http://purl.obolibrary.org/obo/nan', np.nan)
miRNA_lncRNA = miRNA_lncRNA.drop(columns=['regulator']).rename(columns={'1_y' : 'Regulator'})
miRNA_lncRNA.insert(2, 'Regulator', miRNA_lncRNA.pop('Regulator'))

- Regulator interaction.

In [None]:
# Show the raw labels, then replace the two handled interaction types with NCIT terms.
print(miRNA_lncRNA['regulator_interaction'].unique())
interaction_terms = (
    ('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)'),
    ('Protein-RNA', 'http://purl.obolibrary.org/obo/NCIT_C19019 (rna-protein interaction)'),
)
for interaction_label, interaction_term in interaction_terms:
    miRNA_lncRNA['regulator_interaction'] = miRNA_lncRNA['regulator_interaction'].str.replace(
        interaction_label, interaction_term)

- Regulator effect.

In [None]:
# Show the raw labels, then replace 'promote' / 'inhibit' with NCIT terms.
print(miRNA_lncRNA['regulator_effect'].unique())
regulator_effect_terms = (
    ('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)'),
    ('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)'),
)
for effect_label, effect_term in regulator_effect_terms:
    miRNA_lncRNA['regulator_effect'] = miRNA_lncRNA['regulator_effect'].str.replace(
        effect_label, effect_term)

- Target type.

In [None]:
# Show the raw labels, then annotate 'miRNA' with its Sequence Ontology term.
print(miRNA_lncRNA['target_type'].unique())
mirna_term = 'http://purl.obolibrary.org/obo/SO_0000276 (mirna)'
miRNA_lncRNA['target_type'] = miRNA_lncRNA['target_type'].str.replace('miRNA', mirna_term)

- Target effect.

In [None]:
# Show the raw labels, then replace the effect verbs with NCIT terms; both
# spellings of 'inhibit' (lower-case and capitalised) map to the same term.
print(miRNA_lncRNA['target_effect'].unique())
target_effect_terms = (
    ('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)'),
    ('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)'),
    ('Inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)'),
)
for effect_verb, effect_term in target_effect_terms:
    miRNA_lncRNA['target_effect'] = miRNA_lncRNA['target_effect'].str.replace(effect_verb, effect_term)

- Functional mechanism.

In [None]:
# Functional mechanism: values are only inspected, no mapping is applied.
# NOTE(review): this cell used to repeat the 'promote' replacement on
# 'target_effect'. The "Target effect" cell already performs that replacement and
# its output no longer contains the text 'promote', so the repeat was a no-op and
# has been removed. Confirm whether a mapping for 'functional_mechanism' was intended.
print(miRNA_lncRNA['functional_mechanism'].unique())

# Clinical detail: annotate the two handled keywords with ontology terms.
print(miRNA_lncRNA['clinical_detail'].unique())
miRNA_lncRNA['clinical_detail'] = miRNA_lncRNA[
    'clinical_detail'].str.replace('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)')
miRNA_lncRNA['clinical_detail'] = miRNA_lncRNA[
    'clinical_detail'].str.replace('drug', 'http://purl.obolibrary.org/obo/CHEBI_23888 (drug)')

- Disease.

In [None]:
# Pass 'context_detail' through replace_with_mondo and preview the first results.
# NOTE(review): the "Context detail" cell earlier in this section applies the same
# mapping to the same column — confirm whether another column was meant here.
miRNA_lncRNA.context_detail = [replace_with_mondo(item) for item in miRNA_lncRNA.context_detail]
miRNA_lncRNA.context_detail.unique()[:5]

- Tissue/cell line.

In [None]:
# Same lookup as clo_dict, but keyed by the description with the text ' cell'
# removed; the values still carry the full description in parentheses.
clo_short_keys = desc_clo_map[0].str.replace(' cell','')
clo_full_values = 'http://purl.obolibrary.org/obo/' + desc_clo_map[1] + ' (' + desc_clo_map[0] + ')'
clo_dict2 = dict(zip(clo_short_keys, clo_full_values))

def replace_with_clo2(substring):
    """Translate a '|'-separated string through ``clo_dict2``.

    Parts found in ``clo_dict2`` are replaced by their mapped value, other parts
    are kept unchanged. Missing input yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    translated = (clo_dict2.get(token, token) for token in substring.split('|'))
    return '|'.join(translated)

# Lower-case, switch the separator from ';' to '|', then run the three lookups
# in sequence (replace_with_uberon is defined elsewhere in the notebook).
tissue_labels = miRNA_lncRNA['tissue/cell line'].str.lower().str.replace(';', '|')
for lookup in (replace_with_clo, replace_with_clo2, replace_with_uberon):
    tissue_labels = [lookup(label) for label in tissue_labels]
miRNA_lncRNA['tissue/cell line'] = tissue_labels
miRNA_lncRNA['tissue/cell line'].unique()[:5]

- Experimental method.

In [None]:
print(miRNA_lncRNA['experimental_method'].unique()[:5])
# Lower-case FIRST so the typo fixes below also catch capitalised variants
# (e.g. 'Wetern', 'Assay assay'); previously they ran on the raw text and only
# matched the exact lower-case spelling.
experimental_methods = miRNA_lncRNA['experimental_method'].str.lower()
experimental_methods = experimental_methods.str.replace('wetern', 'western')
experimental_methods = experimental_methods.str.replace('assay assay', 'assay')
experimental_methods = experimental_methods.str.replace(';', '|')
# replace_with_ncit is defined elsewhere in the notebook.
miRNA_lncRNA['experimental_method'] = [replace_with_ncit(item) for item in experimental_methods]

- Biological process.

In [None]:
# Lookup table keyed by column 0 of desc_go_map; each value has the form
# "http://purl.obolibrary.org/obo/<column 1> (<column 0>)".
go_descriptions = desc_go_map[0]
go_annotated_urls = 'http://purl.obolibrary.org/obo/' + desc_go_map[1] + ' (' + go_descriptions + ')'
go_dict = dict(zip(go_descriptions, go_annotated_urls))

def replace_with_go(substring):
    """Translate a '|'-separated string through ``go_dict``.

    Parts found in ``go_dict`` are replaced by their mapped value, other parts
    are kept unchanged. Missing input yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    translated = (go_dict.get(token, token) for token in substring.split('|'))
    return '|'.join(translated)

In [None]:
# Preview the raw values, normalise case and separator, then look each part up
# through replace_with_go.
print(miRNA_lncRNA['biological_process'].unique()[:5])
process_labels = miRNA_lncRNA['biological_process'].str.lower().str.replace(';', '|')
miRNA_lncRNA['biological_process'] = [replace_with_go(label) for label in process_labels]

- Pathway.

In [None]:
# Lookup table keyed by column 0 of desc_reactome_map; each value has the form
# "http://purl.obolibrary.org/obo/<column 1> (<column 0>)".
pathway_descriptions = desc_reactome_map[0]
pathway_annotated_urls = ('http://purl.obolibrary.org/obo/' + desc_reactome_map[1] +
                          ' (' + pathway_descriptions + ')')
pw_dict = dict(zip(pathway_descriptions, pathway_annotated_urls))

def replace_with_pw(substring):
    """Translate a '|'-separated string through ``pw_dict``.

    Parts found in ``pw_dict`` are replaced by their mapped value, other parts
    are kept unchanged. Missing input yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    translated = (pw_dict.get(token, token) for token in substring.split('|'))
    return '|'.join(translated)
    
print(miRNA_lncRNA['pathway'].unique()[:5])
pathway_labels = miRNA_lncRNA['pathway'].str.lower().str.replace(';', '|', regex=False)
# The '??' sequences are matched literally (presumably mis-encoded Greek letters in
# the source file — TODO confirm). regex=False is required for that: as regular
# expressions, 'nf-??b' and '/??-catenin' would treat '??' as a lazy quantifier
# and match different text (pandas < 2.0 defaulted to regex=True).
pathway_labels = pathway_labels.str.replace('nf-??b', 'nuclear factor kappa B', regex=False)
pathway_labels = pathway_labels.str.replace('/??-catenin', '', regex=False)
pathway_labels = pathway_labels.str.replace('pi3k/akt ', '', regex=False)
miRNA_lncRNA['pathway'] = [replace_with_pw(item) for item in pathway_labels]

- Drug.

In [None]:
# Special thanks to DrugBank for granting access to this supplementary data!
# Common names are lower-cased so they can be matched case-insensitively below.
DrugBank = pd.read_csv(unprocessed_data_location + 'drugbank vocabulary.csv')
DrugBank = DrugBank.assign(**{'Common name': DrugBank['Common name'].str.lower()})
DrugBank

In [None]:
# Lookup table: common name -> "https://go.drugbank.com/drugs/<DrugBank ID> (<common name>)".
drug_names = DrugBank['Common name']
drug_annotated_urls = 'https://go.drugbank.com/drugs/' + DrugBank['DrugBank ID'] + ' (' + drug_names + ')'
drugbank_dict = dict(zip(drug_names, drug_annotated_urls))
def replace_with_drugbank(substring):
    """Translate a '|'-separated string through ``drugbank_dict``.

    Parts found in ``drugbank_dict`` are replaced by their mapped value, other
    parts are kept unchanged. Missing input yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    translated = (drugbank_dict.get(token, token) for token in substring.split('|'))
    return '|'.join(translated)

# Show the raw values, normalise case and separator, then look each part up
# through replace_with_drugbank.
print(miRNA_lncRNA['drug'].unique())
drug_labels = miRNA_lncRNA['drug'].str.lower().str.replace(';', '|')
miRNA_lncRNA['drug'] = [replace_with_drugbank(label) for label in drug_labels]

- Chromosome.

In [None]:
# Preview the raw values, lower-case them and pass each through replace_with_ncit
# (defined elsewhere in the notebook).
print(miRNA_lncRNA['Chromosome'].unique()[:5])
chromosome_labels = miRNA_lncRNA['Chromosome'].str.lower()
miRNA_lncRNA['Chromosome'] = [replace_with_ncit(label) for label in chromosome_labels]

- Cancer class.

In [None]:
print(miRNA_lncRNA['Cancer_Class'].unique()[:5])
cancer_classes = miRNA_lncRNA['Cancer_Class'].str.lower()
# Literal clean-up steps, applied in order. regex=False is essential: read as
# regular expressions, "[ns]" is a character class (every 'n' and 's'), "[" is
# invalid and "||" matches the empty string (pandas < 2.0 defaulted to regex=True).
literal_replacements = (
    ("[ns]", ']'),
    ("]", '|'),
    ("[", ''),
    (", ", '|'),
    ("||", '|'),
    ('_', ' '),
    ('lung/sclc', 'small cell lung carcinoma'),
)
for old_text, new_text in literal_replacements:
    cancer_classes = cancer_classes.str.replace(old_text, new_text, regex=False)
# Genuine regular expressions: drop parenthesised text, then a trailing '|'.
cancer_classes = cancer_classes.replace(r'\(.*?\)', '', regex=True)
cancer_classes = cancer_classes.replace(r'\|$', '', regex=True)
# replace_with_mondo is defined elsewhere in the notebook.
miRNA_lncRNA['Cancer_Class'] = [replace_with_mondo(item) for item in cancer_classes]

In [None]:
# Move the column labelled 0 (added by the mirna_mirbase_map join) to the front
# and drop the miRNA name it replaces.
identifier_column = miRNA_lncRNA.pop(0)
miRNA_lncRNA.insert(0, 0, identifier_column)
miRNA_lncRNA = miRNA_lncRNA.drop(columns=['mir_id'])

In [None]:
# Give every column a readable title: capitalise, replace underscores by spaces,
# then restore the names whose spelling the capitalisation altered.
miRNA_lncRNA = miRNA_lncRNA.rename(columns={0:'miRNA'})
miRNA_lncRNA.columns = [name.capitalize().replace('_', ' ') for name in miRNA_lncRNA.columns]
restored_names = {
    'Mirna': 'miRNA',
    '0 y': 'lncRNA',
    'Pmid': 'References (PMID)',
    'Cosmic id': 'Cosmic ID',
    'Targetscan site(0=no;1=yes)': 'Targetscan site',
}
miRNA_lncRNA = miRNA_lncRNA.rename(columns=restored_names)

In [None]:
# Tag lncRNA identifiers with a '?lncRNA' suffix, then write mature ('MIMAT...')
# and stem-loop miRNA edges to separate files. Only the stem-loop output is
# de-duplicated after merge_rows, as before.
miRNA_lncRNA['lncRNA'] = miRNA_lncRNA['lncRNA'].astype(str)+'?lncRNA'
is_mature_mirna = miRNA_lncRNA['miRNA'].str.startswith('MIMAT')
maturemiRNA_lncRNA = miRNA_lncRNA[is_mature_mirna]
premiRNA_lncRNA = miRNA_lncRNA[~is_mature_mirna]

mature_edges = merge_rows(maturemiRNA_lncRNA, 'miRNA', 'lncRNA')
mature_edges.to_csv(edge_data_location + 'RmiRNA-lncRNA.txt', sep='\t', index=None)

stem_loop_edges = merge_rows(premiRNA_lncRNA, 'miRNA', 'lncRNA').drop_duplicates()
stem_loop_edges.to_csv(edge_data_location + 'RpremiRNA-lncRNA.txt', sep='\t', index=None)

***
### SNP-miRNA - http://purl.obolibrary.org/obo/RO_0002566 (causally influences)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
! wget https://www.dropbox.com/s/cu4hv35ulu3a8d6/miRNet-snp-mir-hsa.csv?dl=0

In [None]:
# Load the miRNet SNP-miRNA table, keep the rows flagged High_Confidence == 'YES',
# drop the columns not used downstream and tag the source.
miRNA_variant = pd.read_csv(unprocessed_data_location + "miRNet-snp-mir-hsa.csv?dl=0")
is_high_confidence = miRNA_variant['High_Confidence'] == 'YES'
mirnet_snp_unused_columns = ['mirnet','chr_pos','Mature_Name','Mature_Acc','Mature_Pos',
                             'Robust_FANTOM5','Conserved_ADmiRE', 'Family_Name',
                             'AF_Percentile_gnomAD','Phylop_100way','Phastcons_100way','High_Confidence']
miRNA_variant = miRNA_variant[is_high_confidence].drop(columns=mirnet_snp_unused_columns)
miRNA_variant['Source(s)'] = 'miRNet'
miRNA_variant

***
* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/)

In [None]:
! wget http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv

In [None]:
# Load the miRdSNP table and align its key columns with the miRNet names.
miRNA_variant2 = pd.read_csv(
    unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv'
).rename(columns={'SNP':'rsid','miR':'MIRNA_Name'})
miRNA_variant2['Source(s)'] = 'miRdSNP'
miRNA_variant2

In [None]:
# Outer-join both sources on (rsid, MIRNA_Name) and collapse the per-source
# columns into one '|'-separated column; an absent source shows up as 'nan' here.
miRNA_variant = miRNA_variant.merge(miRNA_variant2, how='outer', on=['rsid','MIRNA_Name'])
left_source = miRNA_variant['Source(s)_x'].astype(str)
right_source = miRNA_variant['Source(s)_y'].astype(str)
miRNA_variant['Source(s)'] = left_source + '|' + right_source
miRNA_variant = miRNA_variant.drop(columns=['Source(s)_x', 'Source(s)_y', 'MIRNA_Acc'])
miRNA_variant

In [None]:
# Remove the 'nan' placeholders left behind when the per-source columns were joined.
miRNA_variant['Source(s)'] = miRNA_variant['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

# Map gene symbols to NCBI Gene URLs (left join keeps rows without a match).
miRNA_variant = pd.merge(miRNA_variant, symbol_entrez_map[['0_x','0_y']].rename(columns={'0_x':'gene_name'}), on='gene_name', how='left')
miRNA_variant['0_y'] = "https://www.ncbi.nlm.nih.gov/gene/" + miRNA_variant['0_y'].astype('Int64').astype('str')
# A missing Int64 value is rendered as '<NA>' (not 'nan') by astype('str'), so the
# previous replacement never matched and left a bogus '.../gene/<NA>' URL in place.
# Both spellings are mapped back to NaN.
miRNA_variant['0_y'] = miRNA_variant['0_y'].replace(
    {'https://www.ncbi.nlm.nih.gov/gene/<NA>': np.nan,
     'https://www.ncbi.nlm.nih.gov/gene/nan': np.nan})
miRNA_variant = miRNA_variant.drop(columns=['gene_name'])

# Move 'Source(s)' to the last position.
miRNA_variant.insert(len(miRNA_variant.columns)-1, 'Source(s)', miRNA_variant.pop('Source(s)'))

Manual fix of inconsistencies.

- Disease.

In [None]:
# Lower-case, turn the ', ' separator into '|', then pass each entry through
# replace_with_mondo (defined elsewhere in the notebook).
disease_labels = miRNA_variant['diseases'].str.lower().str.replace(", ", '|')
miRNA_variant['diseases'] = [replace_with_mondo(label) for label in disease_labels]
miRNA_variant['diseases'].unique()[:5]

In [None]:
# Final, human-readable column names for the SNP-miRNA table.
miRNA_variant_column_names = {
    'rsid': 'SNP',
    'gnomAD_MAF': 'gnomAD MAF',
    'MIRNA_Domain': 'miRNA domain',
    'Precursor_Pos': 'Precursor position',
    'Predicted_Motif': 'Predicted motif',
    'refseq_id': 'Refseq ID',
    'diseases': 'Disease(s)',
    'distance': 'Distance',
    'experimentally_confirmed': 'Experimentally confirmed',
    '0_y': 'Gene',
}
miRNA_variant.rename(columns=miRNA_variant_column_names, inplace=True)

In [None]:
# Swap the miRNA name for the identifier in column 0 of mirna_mirbase_map
# (inner join: names without a match are dropped) and put it first.
mirbase_lookup = mirna_mirbase_map.rename(columns={2:'MIRNA_Name'})
miRNA_variant = miRNA_variant.merge(mirbase_lookup, on='MIRNA_Name')
miRNA_variant.insert(0, 0, miRNA_variant.pop(0))
miRNA_variant = miRNA_variant.rename(columns={0:'miRNA'}).drop(columns=['MIRNA_Name'])

# 'MIMAT...' identifiers go to the mature file, everything else to the stem-loop file.
is_mature_mirna = miRNA_variant['miRNA'].str.startswith('MIMAT')
maturemiRNA_variant = miRNA_variant[is_mature_mirna]
premiRNA_variant = miRNA_variant[~is_mature_mirna]
for edge_subset, edge_file in ((maturemiRNA_variant, 'RmiRNA-variant.txt'),
                               (premiRNA_variant, 'RpremiRNA-variant.txt')):
    merge_rows(edge_subset, 'miRNA', 'SNP').to_csv(
        edge_data_location + edge_file, sep='\t', index=None)

***
### SNP-gene - http://purl.obolibrary.org/obo/RO_0002566 (causally influences)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
! wget https://www.dropbox.com/s/8aq8k0yoy5ak0d6/miRNet-snpmirbs-hsa.csv?dl=0

In [None]:
# Load the miRNet SNP / miRNA-binding-site table, tag the source and keep only
# the SNP, gene and transcript columns under their final names.
gene_variant = pd.read_csv(unprocessed_data_location + "miRNet-snpmirbs-hsa.csv?dl=0")
gene_variant = gene_variant.drop(columns=['mirnet', 'chr_pos', 'symbol'])
gene_variant['Source(s)'] = 'miRNet'
gene_variant = gene_variant[['rsid', 'entrez', 'transcript_id', 'Source(s)']].rename(
    columns={'rsid':'SNP', 'entrez':'Gene', 'transcript_id':'Refseq ID'})
gene_variant

In [None]:
gene_variant.drop_duplicates().to_csv(edge_data_location + 'Rvariant-gene.txt', sep='\t', index=None)

***
### SNP-disease - http://purl.obolibrary.org/obo/RO_0002566 (causally influences)

* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php)

In [None]:
disease_variant = pd.read_csv(unprocessed_data_location+'mirdsnp-dsnps-v11.03.csv')
disease_variant.drop(columns=['pub_year','pub_month','link','journal','article_date','title'],inplace=True)
disease_variant.disease = disease_variant.disease.str.lower()
# One row per SNP: split the comma-separated list and explode it.
disease_variant['snps'] = disease_variant.snps.str.split(',')
disease_variant = disease_variant.explode('snps')

# Inner join on the lower-cased disease name; unmatched diseases are dropped.
disease_variant = pd.merge(disease_variant, desc_disPhe_map.rename(columns={0:'disease'}),on='disease')
disease_variant.drop(columns=['disease'],inplace=True)

disease_variant['Source(s)'] = 'miRdSNP'
disease_variant = disease_variant.rename(columns={'pubmed_id':'References (PMID)','snps':'SNP', 1:'Disease'})
disease_variant = disease_variant[['SNP', 'Disease', 'References (PMID)', 'Source(s)']]

disease_variant['References (PMID)'] = 'https://pubmed.ncbi.nlm.nih.gov/' + disease_variant['References (PMID)'].astype('Int64').astype('str')
# A missing Int64 value is rendered as '<NA>' (not 'nan') by astype('str'); the
# previous replacement therefore never matched. Both spellings are mapped to NaN.
disease_variant['References (PMID)'] = disease_variant['References (PMID)'].replace(
    {'https://pubmed.ncbi.nlm.nih.gov/<NA>': np.nan,
     'https://pubmed.ncbi.nlm.nih.gov/nan': np.nan})

disease_variant

In [None]:
merge_rows(disease_variant, 'SNP', 'Disease').to_csv(edge_data_location + 'Rvariant-disease.txt', sep='\t', index=None)

***
### SNP-TF - http://purl.obolibrary.org/obo/RO_0002566 (causally influences)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
! wget https://www.dropbox.com/s/f87f2q9ryjs3il9/miRNet-snptfbs-hsa.csv?dl=0

In [None]:
# Load the miRNet SNP / TF-binding-site table, replace the TF symbol with the
# identifier from symbol_to_pro (inner join: unmatched symbols are dropped) and
# tag the source.
TF_variant = pd.read_csv(unprocessed_data_location + "miRNet-snptfbs-hsa.csv?dl=0").drop(
    columns=['chr_pos','mirnet','entrez','name'])

tf_lookup = symbol_to_pro.rename(columns={'0_x':'symbol'})
TF_variant = TF_variant.merge(tf_lookup, on=['symbol']).drop(columns=['symbol'])
TF_variant = TF_variant.rename(columns={'rsid':'SNP', '1_y':'TF'})
TF_variant['Source(s)'] = 'miRNet'

TF_variant

In [None]:
TF_variant.drop_duplicates().to_csv(
    edge_data_location + 'Rvariant-TF.txt', sep='\t', index=None)

***
### tsRNA-miRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php) <br /> tsRFun is a platform for exploring tsRNA functions based on high-throughput small RNA-Seq and CLIP-Seq data.

In [None]:
! wget https://rna.sysu.edu.cn/tsRFun/download/tsRNetwork/all_hypgm_df.txt

In [None]:
tsRNA_miRNA = pd.read_csv(unprocessed_data_location + 'all_hypgm_df.txt', sep="\t")  
tsRNA_miRNA

In [None]:
# Keep the pairs with FDR < 0.01, drop the unadjusted p-value (the FDR is kept)
# and rename 'adj.p' to 'FDR'.
passes_fdr = tsRNA_miRNA['adj.p'] < 0.01
tsRNA_miRNA = tsRNA_miRNA[passes_fdr].drop(columns='p').rename(columns={'adj.p':'FDR'})
tsRNA_miRNA

In [None]:
# Replace miRNA and tsRNA names by the identifiers from the two lookup tables
# (inner joins: unmatched names are dropped), then keep the final columns.
mirbase_lookup = mirna_mirbase_map.rename(columns={2:'miRNA'})
tsRNA_miRNA = tsRNA_miRNA.merge(mirbase_lookup, on=['miRNA']).drop(columns='miRNA')
trf_lookup = tsRNA_tRF_map.rename(columns={'tRNA':'tsRNA'})
tsRNA_miRNA = tsRNA_miRNA.merge(trf_lookup, on=['tsRNA']).drop(columns='tsRNA')
tsRNA_miRNA = tsRNA_miRNA.rename(columns={0:'miRNA','tsRNAid':'tsRNA'})
tsRNA_miRNA['Source(s)'] = 'tsRFun'
tsRNA_miRNA = tsRNA_miRNA[['miRNA','tsRNA','FDR','Source(s)']]
tsRNA_miRNA

In [None]:
tsRNA_miRNA.drop_duplicates().to_csv(
    edge_data_location + 'RtsRNA-miRNA.txt', sep='\t', index=None)

***
### tsRNA-disease - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition)

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php)

In [None]:
! wget https://rna.sysu.edu.cn/tsRFun/download/tsRinCancer/allCancer_0.txt

In [None]:
tsRNA_disease = pd.read_csv(unprocessed_data_location + 'allCancer_0.txt', sep="\t", index_col=0)  
tsRNA_disease

In [None]:
# Keep only the '<cancer>_log2FC' columns and strip the suffix from their names.
log2fc_column_mask = tsRNA_disease.columns.str.endswith('_log2FC')
tsRNA_disease = tsRNA_disease.loc[:, log2fc_column_mask]
tsRNA_disease.columns = tsRNA_disease.columns.str.replace(r'_log2FC', '')

# A tsRNA is related to a cancer iff |log2FC| >= 1, so weaker values are zeroed.
tsRNA_disease[tsRNA_disease.abs() < 1] = 0
tsRNA_disease

In [None]:
# Example for a single cancer (ACC): one row per tRF whose ACC value is not 0,
# with the cancer label and the log2FC value.
acc_values = tsRNA_disease['ACC']
acc_hits = acc_values[acc_values != 0]
tRF = list(acc_hits.index)
log2FC = list(acc_hits)

df_acc = pd.DataFrame(tRF, columns = ['tRF'])
df_acc['dis'] = 'ACC'
df_acc['log2FC'] = log2FC
df_acc

In [None]:
# Reshape the tRF x cancer matrix into long format: one row per (tRF, cancer)
# pair with columns 'tRF', 'dis' and 'log2FC'. melt() emits the pairs cancer by
# cancer, in row order — the same order the former nested iterrows() loops produced,
# without scanning the whole table once per column or concatenating inside a loop.
trRF_disease = (
    tsRNA_disease.rename_axis(index='tRF')
    .reset_index()
    .melt(id_vars='tRF', var_name='dis', value_name='log2FC')
)
# Keep only real associations: values zeroed by the |log2FC| >= 1 rule are dropped,
# and so are missing values (NaN != 0 is True, so they used to slip through).
is_association = trRF_disease['log2FC'].notna() & (trRF_disease['log2FC'] != 0)
trRF_disease = trRF_disease[is_association].reset_index(drop=True)
trRF_disease

In [None]:
# Replace the cancer label by column 1 of cancer_mondo_map (inner join:
# labels without a match are dropped) and keep the final columns.
cancer_lookup = cancer_mondo_map.rename(columns={0:'dis'})
trRF_disease = trRF_disease.merge(cancer_lookup, on='dis').drop(columns=['dis'])
trRF_disease = trRF_disease.rename(columns={1:'Disease'})
trRF_disease['Source(s)'] = 'tsRFun'
trRF_disease = trRF_disease[['tRF','Disease','log2FC','Source(s)']]
trRF_disease

In [None]:
merge_rows(trRF_disease, "tRF", 'Disease').to_csv(edge_data_location + 'RtsRNA-disease.txt', sep='\t', index=None)

***
### tRF-tRNA - http://purl.obolibrary.org/obo/RO_0002202 (develops from)

* [tRFdb](http://genome.bioch.virginia.edu/trfdb/index.php) <br /> tRFdb is a comprehensive database of tRFs prepared from publicly available high-throughput sequencing data of >50 short RNA libraries. tRFs originate precisely from the extreme 5' (tRF-5) or 3' ends (tRF-3) of mature tRNAs or from the 3' trailer sequence of precursor tRNA transcripts (tRF-1) and are present in humans, mice, flies, worms and yeasts.

In [None]:
# Source: http://genome.bioch.virginia.edu/trfdb/index.php
# The third table of the saved page holds the tRF-1 records; columns not needed
# for the tRF-tRNA edges are removed.
trf1_unused_columns = ['Organism','Experiment Info','Sequence','tRNA Gene Co-ordinates']
tRF1_tRNA = pd.read_html(unprocessed_data_location+'trf1.html')[2].drop(columns=trf1_unused_columns)
tRF1_tRNA.head()

In [None]:
# Third table of the saved tRF-3 page, without the columns not needed for the edges.
trf3_unused_columns = ['Organism','Experiment Info','Sequence','tRNA Gene Co-ordinates']
tRF3_tRNA = pd.read_html(unprocessed_data_location+'trf3.html')[2].drop(columns=trf3_unused_columns)
tRF3_tRNA.head()

In [None]:
# Third table of the saved tRF-5 page, without the columns not needed for the edges.
trf5_unused_columns = ['Organism','Experiment Info','Sequence','tRNA Gene Co-ordinates']
tRF5_tRNA = pd.read_html(unprocessed_data_location+'trf5.html')[2].drop(columns=trf5_unused_columns)
tRF5_tRNA.head()

In [None]:
# Stack the three tRF tables, drop the 'Type' column and tag the source.
tRF_tRNA = pd.concat([tRF1_tRNA,tRF3_tRNA,tRF5_tRNA]).drop(columns=['Type'])
tRF_tRNA.insert(len(tRF_tRNA.columns), 'Source(s)', 'tRFdb')
tRF_tRNA.head()

In [None]:
tRF_tRNA.rename(columns={'tRF ID':'tRF','tRNA Name':'tRNA'},inplace=True)

In [None]:
tRF_tRNA.drop_duplicates().to_csv(edge_data_location + 'RtRF-tRNA_tRFdb.txt', sep='\t', index=None)

***
* [MINTbase](https://cm.jefferson.edu/MINTbase/) <br /> The Mitochondrial and Nuclear tRNA fragment database (MINTbase) is a repository of tRNA fragments (tRFs).

In [None]:
# https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbasetRF-tRNA.txt',sep='\t')
# Keep only the part of the alternative ID that precedes '@'; it is the join key
# into tRNA_MINTbase_GtRNAdb_map.
alternative_id_column = 'MINTbase Alternative IDs (GRCh37 assembly-derived)'
tRF_tRNA2[alternative_id_column] = tRF_tRNA2[alternative_id_column].str.split('@').str[0]
tRF_tRNA2.rename(columns={alternative_id_column:'MINTbase tRNA name'},inplace=True)
tRF_tRNA2 = pd.merge(tRF_tRNA2, tRNA_MINTbase_GtRNAdb_map, on='MINTbase tRNA name')
tRF_tRNA2.drop(columns=['MINTbase tRNA name'],inplace=True)
# For post-processing issues
tRF_tRNA2['gtRNAdb name'] = tRF_tRNA2['gtRNAdb name'].astype(str)+'.html'
# .copy() makes the selection an independent frame, so the column assignments
# below do not act on a slice (avoids SettingWithCopyWarning).
tRF_tRNA2 = tRF_tRNA2[['License Plate (sequence derived)','Expressed (# of datasets)?','Maximum RPM','gtRNAdb name']].copy()
tRF_tRNA2['Source(s)'] = 'MINTbase'
tRF_tRNA2.rename(columns={'License Plate (sequence derived)':'tRF','Expressed (# of datasets)?':'Expressed (# of datasets)',
                          'gtRNAdb name':'tRNA'},inplace=True)
tRF_tRNA2 = tRF_tRNA2[['tRF','tRNA','Expressed (# of datasets)','Maximum RPM','Source(s)']].copy()

# Reduce "yes (N)" to N. regex=False is required: as a regular expression
# "yes (" has an unbalanced parenthesis and raises on pandas < 2.0, where
# regex=True was the default.
tRF_tRNA2['Expressed (# of datasets)'] = tRF_tRNA2['Expressed (# of datasets)'].str.replace("yes (", "", regex=False)
tRF_tRNA2['Expressed (# of datasets)'] = tRF_tRNA2['Expressed (# of datasets)'].str.replace(")", "", regex=False)
tRF_tRNA2['Expressed (# of datasets)'] = tRF_tRNA2['Expressed (# of datasets)'].astype('Int64')

tRF_tRNA2

In [None]:
tRF_tRNA2.drop_duplicates().to_csv(edge_data_location + 'RtRF-tRNA_MINTbase.txt', sep='\t', index=None)

***
### tRF-cell line - http://purl.obolibrary.org/obo/RO_0001025 (located in)

* [tRFdb](http://genome.bioch.virginia.edu/trfdb/index.php)

In [None]:
# Saved tRFdb pages (http://genome.bioch.virginia.edu/trfdb/index.php).
# From each page the third parsed table is used and its 'Organism' column is dropped.
tRF1_tRNA, tRF3_tRNA, tRF5_tRNA = [
    pd.read_html(unprocessed_data_location + page)[2].drop(columns=['Organism'])
    for page in ('trf1.html', 'trf3.html', 'trf5.html')
]

# Stack the three tables and keep the tRF ID as a string.
tRF_tRNA = pd.concat([tRF1_tRNA, tRF3_tRNA, tRF5_tRNA]).drop(columns=['Experiment Info', 'Sequence'])
tRF_tRNA['tRF ID'] = tRF_tRNA['tRF ID'].astype(str)
tRF_tRNA

In [None]:
# Download the experiment table of every tRF from tRFdb and stack the tables into `df`.
frames = []

for tRF_ID in tRF_tRNA['tRF ID']:
    url = f"http://genome.bioch.virginia.edu/trfdb/experiments_display.php?trf_id={tRF_ID}&organism=human"
    # A timeout avoids hanging forever on a stalled connection; raise_for_status stops
    # an HTTP error page from being parsed as if it were data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    temp = pd.read_html(response.content)[0].drop(columns=['GEO / SRA Links','View Alignment','Graph Alignment'])
    temp['tRF ID'] = tRF_ID
    frames.append(temp)

# Concatenate once: growing a DataFrame inside the loop re-copies all rows on every iteration.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Normalise the source description: lower-case, '-' -> ' ', drop 'normal ' and 'whole',
# then make sure it ends with ' cell'.
df['Source'] = df['Source'].str.lower().str.replace('-', ' ').str.replace('normal ', '').str.replace('whole', '')
df['Source'] = df['Source'].apply(lambda x: x + ' cell' if not x.endswith(' cell') else x)
# Discard rows whose Abundance is 0.
df = df[df['Abundance'] != 0]
df

In [None]:
# Inspect the distinct normalised source descriptions.
df['Source'].unique()

In [None]:
# Turn experiment accessions into links: 'GSM…' accessions get the GEO prefix, 'SR…'
# accessions the SRA prefix. Rows matching neither prefix are left out of the result.
# na=False keeps the masks boolean if an accession is missing; .copy() makes the
# assignments act on independent frames instead of slices of `df`.
df_gsm = df[df['Experiment'].str.startswith('GSM', na=False)].copy()
df_sra = df[df['Experiment'].str.startswith('SR', na=False)].copy()
df_gsm['Experiment'] = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" + df_gsm['Experiment']
df_sra['Experiment'] = "https://www.ncbi.nlm.nih.gov/sra/query/acc.cgi?acc=" + df_sra['Experiment']
df = pd.concat([df_gsm, df_sra])

In [None]:
# Join the source descriptions against desc_clo_map (shared key column 0); column 1 of
# the map becomes 'Cell line' (presumably a CLO term — verify against desc_clo_map).
df = df.rename(columns={'Source': 0}).merge(desc_clo_map)
df = df.drop(columns=[0]).rename(columns={1: 'Cell line', 'tRF ID': 'tRF'})
df = df[['tRF', 'Cell line', 'Experiment', 'Abundance']]
df

In [None]:
# Pass the frame through merge_rows keyed on ('tRF', 'Cell line') and save the result as TSV.
merge_rows(df, "tRF", 'Cell line').to_csv(
    edge_data_location + 'RtRF-cellLine.txt',
    sep='\t',
    index=None,
)

***
### tRNA-modification - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [GtRNAdb](http://gtrnadb.ucsc.edu/GtRNAdb2/index.html) <br /> 
The genomic tRNA database contains tRNA gene predictions made by tRNAscan-SE on complete or nearly complete genomes. Unless otherwise noted, all annotation is automated, and has not been inspected for agreement with published literature.

In [None]:
fasta_file_path = unprocessed_data_location + 'hg38-tRNAs.fa'

# Read all (title, sequence) records, then split them into two parallel lists.
with open(fasta_file_path) as fasta_file:
    records = list(SimpleFastaParser(fasta_file))

identifiers = [title.split(None, 1)[0] for title, _ in records]  # First word is ID
seq = [sequence for _, sequence in records]

data = {"Identifier": identifiers, "Sequence": seq}
df = pd.DataFrame(data)
# Cut off as many leading characters as 'Homo_sapiens_' has.
df['Identifier'] = df['Identifier'].str[len('Homo_sapiens_'):]
df

In [None]:
# Scrape the GtRNAdb gene page of each tRNA and keep its gene symbol and known modifications.
gene_page_root = 'http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/'
frames = []
# NOTE(review): the first identifier is skipped ([1:]) — kept as in the original; confirm it is intentional.
for identifier in df['Identifier'][1:]:
    # Download and parse each page once; both tables come from the same parse.
    tables = pd.read_html(gene_page_root + identifier + '.html')
    temp = pd.concat([tables[0].T, tables[1].T], axis=1)
    # After transposing, the first row carries the field names.
    temp.columns = temp.iloc[0]
    temp = temp[1:]
    frames.append(temp)

# Concatenate once instead of re-copying the accumulated frame on every iteration.
tRNA = pd.concat(frames) if frames else pd.DataFrame()
tRNA = tRNA[['GtRNAdb Gene Symbol', 'Known Modifications (Modomics)']]
tRNA

In [None]:
# Split the space-separated modification list, then emit one row per (tRNA, modification).
tRNA['Known Modifications (Modomics)'] = tRNA['Known Modifications (Modomics)'].str.split(' ')
tRNA_mod = (
    tRNA[['GtRNAdb Gene Symbol', 'Known Modifications (Modomics)']]
    .explode('Known Modifications (Modomics)')
    .dropna()
    .rename(columns={'GtRNAdb Gene Symbol': 'tRNA', 'Known Modifications (Modomics)': 'Modification'})
)
# Append the '.html' suffix to the tRNA identifier.
tRNA_mod['tRNA'] = tRNA_mod['tRNA'] + '.html'
tRNA_mod


In [None]:
# Write the de-duplicated tRNA-modification edges as a tab-separated file without the index.
(
    tRNA_mod
    .drop_duplicates()
    .to_csv(edge_data_location + 'RtRNA-mod.txt', sep='\t', index=None)
)

***
### tRNA-amino acid - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with)

* [tRNAdb](http://trna.bioinf.uni-leipzig.de/DataOutput/) <br /> tRNAdb contains more than 12 000 tRNA genes, classified into families according to amino acid specificity. The database provides various services including graphical representations of tRNA secondary structures, a customizable output of aligned or un-aligned sequences with a variety of individual and combinable search criteria, as well as the construction of consensus sequences for any selected set of tRNAs.

In [None]:
# Saved result page of http://trna.bioinf.uni-leipzig.de/DataOutput/Result
# The fourth parsed table is used; six positional columns are discarded.
tRNA_aa = pd.read_html(unprocessed_data_location+'tRNAdb - Transfer RNA database.html')[3]
tRNA_aa = tRNA_aa.drop(columns=[0, 1, 2, 4, 19, 20])
# Use the first row as column names, then drop the first two rows.
tRNA_aa = tRNA_aa.rename(columns=tRNA_aa.iloc[0]).iloc[2:]
tRNA_aa.head()

In [None]:
# Rebuild tRNA_aa from the gtRNAdb names and derive the amino acid from each name.
# .copy() so the column assignments below act on an independent frame, not on a slice of the map.
tRNA_aa = tRNA_MINTbase_GtRNAdb_map[['gtRNAdb name']].copy()
tRNA_aa['gtRNAdb name'] = tRNA_aa['gtRNAdb name'] + '.html'
# Second '-'-separated field of the name; when it is the literal 'tRNA', the amino acid
# is taken from the third field instead.
tRNA_aa['new'] = tRNA_MINTbase_GtRNAdb_map['gtRNAdb name'].str.split("-").str[1]
tRNA_aa['Amino Acid'] = tRNA_aa[tRNA_aa['new']=='tRNA']['gtRNAdb name'].str.split("-").str[2]
# Assign the result back: an in-place fillna on a column selection is chained
# assignment and does not update the frame under pandas copy-on-write.
tRNA_aa['Amino Acid'] = tRNA_aa['Amino Acid'].fillna(tRNA_aa['new'])
tRNA_aa.drop(columns=['new'],inplace=True)
tRNA_aa

In [None]:
# Join on the amino acid (column 0 of aa_chebi_map) and keep the map's column 1 as
# 'Amino acid' (presumably a ChEBI identifier — verify against aa_chebi_map).
tRNA_aa = (
    tRNA_aa
    .merge(aa_chebi_map.rename(columns={0: 'Amino Acid'}), on='Amino Acid')
    .drop(columns=['Amino Acid'])
    .rename(columns={'gtRNAdb name': 'tRNA', 1: 'Amino acid'})
)
tRNA_aa['Source(s)'] = 'tRNAdb'
tRNA_aa

In [None]:
# Write the de-duplicated tRNA-amino acid edges as a tab-separated file without the index.
(
    tRNA_aa
    .drop_duplicates()
    .to_csv(edge_data_location + 'RtRNA-aminoacid.txt', sep='\t', index=None)
)

***
### snoRNA-gene - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/) <br /> snoDB is an interactive database of human small nucleolar RNAs (snoRNAs) that includes up-to-date information on snoRNA features, genomic location, conservation, host gene, snoRNA-RNA targets and snoRNA abundance and provides links to other resources.

In [None]:
# Fetch the full snoDB dump into the unprocessed-data folder via the data_downloader helper.
data_downloader(
    'https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/download_all',
    unprocessed_data_location,
)

In [None]:
# Columns of the snoDB dump used by the edge-building cells.
snoDB_columns = ['ncbi_id','host_gene_name','target_count','rrna_targets','snrna_targets','lncrna_targets',
                 'protein_coding_targets','snorna_targets','mirna_targets','trna_targets','ncrna_targets',
                 'pseudogene_targets','other_targets','is_expressed']
snoDB = pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")[snoDB_columns]
# Keep only rows that have an ncbi_id, stored as a nullable integer.
snoDB = snoDB.dropna(subset=['ncbi_id'])
snoDB['ncbi_id'] = snoDB['ncbi_id'].astype('Int64')
snoDB

In [None]:
# Resolve each host gene name through symbol_entrez_map (key column '0_x', value '0_y').
snoRNA_gene = symbol_entrez_map.rename(columns={'0_x': 'host_gene_name'}).merge(
    snoDB[['ncbi_id', 'host_gene_name', 'target_count', 'is_expressed']],
    on='host_gene_name',
)
# Tag the snoRNA identifier with its node-type suffix.
snoRNA_gene['ncbi_id'] = snoRNA_gene['ncbi_id'].astype(str) + '?snoRNA'
snoRNA_gene = snoRNA_gene.drop(columns=['host_gene_name'])
snoRNA_gene['Source(s)'] = 'snoDB'
snoRNA_gene = snoRNA_gene.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', '0_y': 'Gene',
                                          'is_expressed': 'Expressed (True/False)'})
snoRNA_gene = snoRNA_gene[['snoRNA', 'Gene', 'Target count', 'Expressed (True/False)', 'Source(s)']]
# Render counts as integer strings; missing counts become NaN again.
snoRNA_gene['Target count'] = (
    snoRNA_gene['Target count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_gene

In [None]:
# Write the de-duplicated snoRNA-gene edges as a tab-separated file without the index.
(
    snoRNA_gene
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-gene.txt', sep='\t', index=None)
)

***
### snoRNA-miRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list miRNA targets and carry an ncbi_id.
snoRNA_miRNA = (
    snoDB[['ncbi_id', 'mirna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['mirna_targets', 'ncbi_id'])
)
# Convert the id to an integer type before rendering it, then add the node-type suffix.
snoRNA_miRNA['ncbi_id'] = pd.to_numeric(snoRNA_miRNA['ncbi_id'], downcast='integer')
snoRNA_miRNA['ncbi_id'] = snoRNA_miRNA['ncbi_id'].astype(str) + '?snoRNA'
snoRNA_miRNA

In [None]:
# Manually curated miRBase accessions, assigned by position.
# NOTE(review): this requires snoRNA_miRNA to hold exactly two rows in this order;
# a different snoDB release may break the assignment — confirm against the current dump.
snoRNA_miRNA['miRBase_id'] = [np.nan, 'MI0000075']
snoRNA_miRNA.drop(columns='mirna_targets',inplace=True)
snoRNA_miRNA['Source(s)'] = 'snoDB'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_miRNA['target_count'] = snoRNA_miRNA['target_count'].astype('Int64').astype(str)
snoRNA_miRNA['target_count'] = snoRNA_miRNA['target_count'].replace("<NA>", np.nan)

# Up to now, no miRBase ID is associated with AC008521
# Only the row with index label 891 is kept, as a one-row frame.
# NOTE(review): the label 891 is hard-coded — verify it still points at the intended row.
snoRNA_miRNA = pd.DataFrame(snoRNA_miRNA.loc[891]).T
snoRNA_miRNA

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# miRNet export: https://www.dropbox.com/s/gpt1yrwoe1h2gx7/miRNet-mir-sncRNA.csv?dl=0
snoRNA_miRNA2 = pd.read_csv(unprocessed_data_location + 'miRNet-mir-sncRNA.csv')
# Keep only records whose gene name mentions 'small nucleolar', then drop unused columns.
snoRNA_miRNA2 = (
    snoRNA_miRNA2[snoRNA_miRNA2['gene_name'].str.contains('small nucleolar')]
    .drop(columns=['mirnet', 'mir_id', 'symbol', 'embl', 'gene_name', 'mbv'])
)
snoRNA_miRNA2['entrez'] = snoRNA_miRNA2['entrez'].astype(str) + '?snoRNA'
snoRNA_miRNA2 = snoRNA_miRNA2.rename(columns={'mir_acc': 'miRBase_id', 'entrez': 'ncbi_id'})
snoRNA_miRNA2['Source(s)'] = 'miRNet'
snoRNA_miRNA2

In [None]:
# Stack the snoDB and miRNet records and give the columns their final names and order.
snoRNA_miRNA = (
    pd.concat([snoRNA_miRNA, snoRNA_miRNA2])
    .rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count',
                     'is_expressed': 'Expressed (True/False)', 'miRBase_id': 'miRNA'})
    [['snoRNA', 'miRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]
)
snoRNA_miRNA

In [None]:
# Split the edges by accession prefix: 'MIMAT…' accessions go to the miRNA file, all
# other accessions to the pre-miRNA file.
# na=False keeps the mask boolean when an accession is missing; rows without any
# accession are written to neither file.
is_mature = snoRNA_miRNA['miRNA'].str.startswith('MIMAT', na=False)
has_accession = snoRNA_miRNA['miRNA'].notna()
maturesnoRNA_miRNA = snoRNA_miRNA[is_mature]
presnoRNA_miRNA = snoRNA_miRNA[has_accession & ~is_mature]

maturesnoRNA_miRNA.drop_duplicates().to_csv(edge_data_location + 'RsnoRNA-miRNA.txt', sep='\t', index=None)
presnoRNA_miRNA.drop_duplicates().to_csv(edge_data_location + 'RsnoRNA-premiRNA.txt', sep='\t', index=None)

***
### snoRNA-snoRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list at least one snoRNA target.
snoRNA_snoRNA = (
    snoDB[['ncbi_id', 'snorna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['snorna_targets'])
)
snoRNA_snoRNA

In [None]:
# One row per target: split the ';'-separated list and explode it.
snoRNA_snoRNA = (
    snoRNA_snoRNA
    .assign(processed_targets=snoRNA_snoRNA['snorna_targets'].str.split(';'))
    .explode('processed_targets')
    .drop(columns=['snorna_targets'])
)
# Resolve each target symbol through symbol_entrez_map (key '0_x', value '0_y').
snoRNA_snoRNA = symbol_entrez_map.rename(columns={'0_x': 'processed_targets'}).merge(
    snoRNA_snoRNA, on='processed_targets')
snoRNA_snoRNA['ncbi_id'] = snoRNA_snoRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
snoRNA_snoRNA = snoRNA_snoRNA.drop(columns=['processed_targets'])
# Discard rows whose source id rendered as '<NA>'.
snoRNA_snoRNA = snoRNA_snoRNA[~snoRNA_snoRNA['ncbi_id'].str.startswith('<NA>')]
snoRNA_snoRNA['0_y'] = snoRNA_snoRNA['0_y'].astype(str) + '?snoRNA'
snoRNA_snoRNA['Source(s)'] = 'snoDB'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_snoRNA['target_count'] = (
    snoRNA_snoRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_snoRNA = snoRNA_snoRNA.rename(columns={'ncbi_id': 'snoRNA1', 'target_count': 'Target count', '0_y': 'snoRNA2',
                                              'is_expressed': 'Expressed (True/False)'})
snoRNA_snoRNA = snoRNA_snoRNA[['snoRNA1', 'snoRNA2', 'Target count', 'Expressed (True/False)', 'Source(s)']]
snoRNA_snoRNA

In [None]:
# Write the de-duplicated snoRNA-snoRNA edges as a tab-separated file without the index.
(
    snoRNA_snoRNA
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-snoRNA.txt', sep='\t', index=None)
)

***
### snoRNA-lncRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list lncRNA targets, with the ';'-separated target list split
# into one row per target.
snoRNA_lncRNA = (
    snoDB[['ncbi_id', 'lncrna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['lncrna_targets'])
    .assign(lncrna_targets=lambda frame: frame['lncrna_targets'].str.split(';'))
    .explode('lncrna_targets')
)

snoRNA_lncRNA.head()

In [None]:
# Resolve each lncRNA target symbol through symbol_entrez_map (key '0_x', value '0_y').
snoRNA_lncRNA = symbol_entrez_map.rename(columns={'0_x': 'lncrna_targets'}).merge(
    snoRNA_lncRNA, on='lncrna_targets')
# Tag both endpoints with their node-type suffix.
snoRNA_lncRNA['ncbi_id'] = snoRNA_lncRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
snoRNA_lncRNA['0_y'] = snoRNA_lncRNA['0_y'].astype('Int64').astype(str) + '?lncRNA'
snoRNA_lncRNA = snoRNA_lncRNA.drop(columns=['lncrna_targets'])
# Discard rows whose source id rendered as '<NA>'.
snoRNA_lncRNA = snoRNA_lncRNA[~snoRNA_lncRNA['ncbi_id'].str.startswith('<NA>')]
snoRNA_lncRNA['Source(s)'] = 'snoDB'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_lncRNA['target_count'] = (
    snoRNA_lncRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_lncRNA = snoRNA_lncRNA.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', '0_y': 'lncRNA',
                                              'is_expressed': 'Expressed (True/False)'})
snoRNA_lncRNA = snoRNA_lncRNA[['snoRNA', 'lncRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]
snoRNA_lncRNA

***
### snoRNA-snRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list snRNA targets, with the ';'-separated target list split
# into one row per target.
snoRNA_snRNA = (
    snoDB[['ncbi_id', 'snrna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['snrna_targets'])
    .assign(snrna_targets=lambda frame: frame['snrna_targets'].str.split(';'))
    .explode('snrna_targets')
)

snoRNA_snRNA

In [None]:
# Resolve each snRNA target symbol through symbol_entrez_map (key '0_x', value '0_y').
snoRNA_snRNA = symbol_entrez_map.rename(columns={'0_x': 'snrna_targets'}).merge(
    snoRNA_snRNA, on='snrna_targets')
# Tag both endpoints with their node-type suffix.
snoRNA_snRNA['ncbi_id'] = snoRNA_snRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
snoRNA_snRNA['0_y'] = snoRNA_snRNA['0_y'].astype('Int64').astype(str) + '?snRNA'
snoRNA_snRNA = snoRNA_snRNA.drop(columns=['snrna_targets'])
# Discard rows whose source id rendered as '<NA>'.
snoRNA_snRNA = snoRNA_snRNA[~snoRNA_snRNA['ncbi_id'].str.startswith('<NA>')]
snoRNA_snRNA['Source(s)'] = 'snoDB'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_snRNA['target_count'] = (
    snoRNA_snRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_snRNA = snoRNA_snRNA.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', '0_y': 'snRNA',
                                            'is_expressed': 'Expressed (True/False)'})
snoRNA_snRNA = snoRNA_snRNA[['snoRNA', 'snRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]
snoRNA_snRNA.head()

In [None]:
# Write the snoRNA-snRNA edges as a tab-separated file without the index.
# Duplicates are dropped first, as for every other edge file written by this notebook;
# the symbol merge can yield repeated rows.
snoRNA_snRNA.drop_duplicates().to_csv(edge_data_location + 'RsnoRNA-snRNA.txt', sep='\t', index=None)

***
### snoRNA-rRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list rRNA targets, with the ';'-separated target list split
# into one row per target.
snoRNA_rRNA = (
    snoDB[['ncbi_id', 'rrna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['rrna_targets'])
    .assign(rrna_targets=lambda frame: frame['rrna_targets'].str.split(';'))
    .explode('rrna_targets')
)

snoRNA_rRNA

In [None]:
# Tag both endpoints; rRNA targets keep their snoDB name with a '?snoDBrRNA' suffix.
snoRNA_rRNA['ncbi_id'] = snoRNA_rRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
snoRNA_rRNA['rrna_targets'] = snoRNA_rRNA['rrna_targets'].astype(str) + '?snoDBrRNA'
# Discard rows whose source id rendered as '<NA>'.
snoRNA_rRNA = snoRNA_rRNA[~snoRNA_rRNA['ncbi_id'].str.startswith('<NA>')]
snoRNA_rRNA['Source(s)'] = 'snoDB'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_rRNA['target_count'] = (
    snoRNA_rRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_rRNA = snoRNA_rRNA.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', 'rrna_targets': 'rRNA',
                                          'is_expressed': 'Expressed (True/False)'})
snoRNA_rRNA = snoRNA_rRNA[['snoRNA', 'rRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]
snoRNA_rRNA

In [None]:
# Write the de-duplicated snoRNA-rRNA edges as a tab-separated file without the index.
(
    snoRNA_rRNA
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-rRNA.txt', sep='\t', index=None)
)

***
### snoRNA-mRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list protein-coding targets, with the ';'-separated target list
# split into one row per target.
snoRNA_mRNA = (
    snoDB[['ncbi_id', 'protein_coding_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['protein_coding_targets'])
    .assign(protein_coding_targets=lambda frame: frame['protein_coding_targets'].str.split(';'))
    .explode('protein_coding_targets')
)

snoRNA_mRNA

In [None]:
# Resolve each protein-coding target symbol through symbol_entrez_map (key '0_x', value '0_y').
snoRNA_mRNA = symbol_entrez_map.rename(columns={'0_x': 'protein_coding_targets'}).merge(
    snoRNA_mRNA, on='protein_coding_targets')
# Tag both endpoints with their node-type suffix.
snoRNA_mRNA['ncbi_id'] = snoRNA_mRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
snoRNA_mRNA['0_y'] = snoRNA_mRNA['0_y'].astype('Int64').astype(str) + '?mRNA'
snoRNA_mRNA = snoRNA_mRNA.drop(columns=['protein_coding_targets'])
# Discard rows whose source id rendered as '<NA>'.
snoRNA_mRNA = snoRNA_mRNA[~snoRNA_mRNA['ncbi_id'].str.startswith('<NA>')]
snoRNA_mRNA['Source(s)'] = 'snoDB'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_mRNA['target_count'] = (
    snoRNA_mRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_mRNA = snoRNA_mRNA.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', '0_y': 'mRNA',
                                          'is_expressed': 'Expressed (True/False)'})
snoRNA_mRNA = snoRNA_mRNA[['snoRNA', 'mRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]
snoRNA_mRNA

In [None]:
# Write the de-duplicated snoRNA-mRNA edges as a tab-separated file without the index.
(
    snoRNA_mRNA
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-mRNA.txt', sep='\t', index=None)
)

***
### snoRNA-tRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list tRNA targets, with the ';'-separated target list split
# into one row per target.
snoRNA_tRNA = (
    snoDB[['ncbi_id', 'trna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['trna_targets'])
    .assign(trna_targets=lambda frame: frame['trna_targets'].str.split(';'))
    .explode('trna_targets')
)
# Remove the '_TRNA' marker and any remaining underscores from the target names.
snoRNA_tRNA['trna_targets'] = (
    snoRNA_tRNA['trna_targets'].astype(str).str.replace('_TRNA','').str.replace('_','')
)

snoRNA_tRNA['Source(s)'] = 'snoDB'
snoRNA_tRNA['ncbi_id'] = snoRNA_tRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
# Render counts as integer strings; missing counts become NaN again.
snoRNA_tRNA['target_count'] = (
    snoRNA_tRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_tRNA = snoRNA_tRNA.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', 'trna_targets': 'tRNA',
                                          'is_expressed': 'Expressed (True/False)'})
snoRNA_tRNA = snoRNA_tRNA[['snoRNA', 'tRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]

snoRNA_tRNA

In [None]:
# Write the de-duplicated snoRNA-tRNA edges as a tab-separated file without the index.
(
    snoRNA_tRNA
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-tRNA.txt', sep='\t', index=None)
)

***
### snoRNA-(non-specifically-classified) ncRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list ncRNA targets, with the ';'-separated target list split
# into one row per target.
snoRNA_ncRNA = (
    snoDB[['ncbi_id', 'ncrna_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['ncrna_targets'])
    .assign(ncrna_targets=lambda frame: frame['ncrna_targets'].str.split(';'))
    .explode('ncrna_targets')
)
snoRNA_ncRNA['Source(s)'] = 'snoDB'
snoRNA_ncRNA

In [None]:
# Rebuild symbol_entrez_map by joining the symbol/Ensembl map with the Entrez/transcript
# map on column 1, keeping '3_x' (compared against RNA type labels in later cells)
# next to the '0_x'/'0_y' pair.
entrez_enst_map = pd.read_csv(
    processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt',
    sep="\t",
    header=None,
)
symbol_entrez_map = symbol_ensembl_map.merge(entrez_enst_map, on=[1])[['0_x', '0_y', '3_x']]
symbol_entrez_map

In [None]:
# Resolve each ncRNA target symbol through symbol_entrez_map (key '0_x').
snoRNA_ncRNA = symbol_entrez_map.rename(columns={'0_x': 'ncrna_targets'}).merge(
    snoRNA_ncRNA, on='ncrna_targets')
snoRNA_ncRNA['ncbi_id'] = snoRNA_ncRNA['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'
snoRNA_ncRNA = snoRNA_ncRNA.drop(columns=['ncrna_targets'])
# Discard rows whose source id rendered as '<NA>'.
snoRNA_ncRNA = snoRNA_ncRNA[~snoRNA_ncRNA['ncbi_id'].str.startswith('<NA>')]
snoRNA_ncRNA

In [None]:
# Suffix the target identifier with the value of '3_x', giving '<id>?<type>'.
target_ids = snoRNA_ncRNA['0_y'].astype(str)
target_types = snoRNA_ncRNA['3_x'].astype(str)
snoRNA_ncRNA['0_y'] = target_ids + '?' + target_types
snoRNA_ncRNA

In [None]:
# Inspect the distinct '3_x' values present among the ncRNA targets.
snoRNA_ncRNA.loc[:, '3_x'].unique()

In [None]:
# Render counts as integer strings; missing counts become NaN again.
snoRNA_ncRNA['target_count'] = (
    snoRNA_ncRNA['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
# Final column names and order; '3_x' is kept so later cells can split by it.
snoRNA_ncRNA = snoRNA_ncRNA.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', '0_y': 'ncRNA',
                                            'is_expressed': 'Expressed (True/False)'})
snoRNA_ncRNA = snoRNA_ncRNA[['snoRNA', '3_x', 'ncRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]

In [None]:
# ncRNA targets whose '3_x' value is 'lncRNA', reshaped to the snoRNA-lncRNA layout.
snoRNA_lncRNA2 = (
    snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'lncRNA']
    .rename(columns={'ncRNA': 'lncRNA'})
    .drop(columns=['3_x'])
)
snoRNA_lncRNA2

In [None]:
# Append the lncRNA rows found among the ncRNA targets to the snoRNA-lncRNA edges.
# Re-running this cell appends the same rows again.
snoRNA_lncRNA = pd.concat(
    [snoRNA_lncRNA, snoRNA_lncRNA2],
)
snoRNA_lncRNA

In [None]:
# ncRNA targets whose '3_x' value is 'retained_intron'.
snoRNA_ri = (
    snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'retained_intron']
    .rename(columns={'ncRNA': 'Retained intron'})
    .drop(columns=['3_x'])
)
snoRNA_ri

In [None]:
# Write the de-duplicated snoRNA-retained intron edges as a tab-separated file without the index.
(
    snoRNA_ri
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-retainedIntron.txt', sep='\t', index=None)
)

In [None]:
# ncRNA targets whose '3_x' value is 'misc_RNA'.
# The target column is named 'miscRNA'; it was previously labelled 'Retained intron',
# a copy-paste leftover from the retained-intron cell that mislabelled the exported column.
snoRNA_miscRNA = (
    snoRNA_ncRNA[snoRNA_ncRNA['3_x'] == 'misc_RNA']
    .drop(columns=['3_x'])
    .rename(columns={'ncRNA': 'miscRNA'})
)
snoRNA_miscRNA

In [None]:
# Write the de-duplicated snoRNA-miscRNA edges as a tab-separated file without the index.
(
    snoRNA_miscRNA
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-miscRNA.txt', sep='\t', index=None)
)

In [None]:
# ncRNA targets whose '3_x' value is 'scaRNA'.
snoRNA_scaRNA = (
    snoRNA_ncRNA.loc[snoRNA_ncRNA['3_x'] == 'scaRNA']
    .drop(columns=['3_x'])
    .rename(columns={'ncRNA': 'scaRNA'})
)
snoRNA_scaRNA.head()

In [None]:
# Remove the '3_x' column from the shared symbol_entrez_map in place, leaving the
# '0_x'/'0_y' pair. Running this cell twice raises a KeyError because '3_x' is already gone.
symbol_entrez_map.drop(columns=['3_x'],inplace=True)

***
### snoRNA-pseudogene - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list pseudogene targets.
snoRNA_pseudogene = (
    snoDB[['ncbi_id', 'pseudogene_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['pseudogene_targets'])
)
snoRNA_pseudogene['ncbi_id'] = snoRNA_pseudogene['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'

# One row per target: split the ';'-separated list and explode it.
snoRNA_pseudogene = (
    snoRNA_pseudogene
    .assign(pseudogene_targets=snoRNA_pseudogene['pseudogene_targets'].str.split(';'))
    .explode('pseudogene_targets')
)

snoRNA_pseudogene

In [None]:
# Manually curated identifiers that overwrite the pseudogene targets by position.
# NOTE(review): this requires exactly ten exploded rows in this order; a different
# snoDB release will raise a length mismatch or mislabel rows — confirm against the current dump.
snoRNA_pseudogene['pseudogene_targets'] = ['107075265','100287215','106481730','26121',
                                           '100420364','401914','100420656','26121',
                                           '26121','100873211']
# Tag the target with its node-type suffix.
snoRNA_pseudogene['pseudogene_targets'] = snoRNA_pseudogene['pseudogene_targets']+'?pseudo'
snoRNA_pseudogene['Source(s)'] = 'snoDB'

# Render counts as integer strings; missing counts become NaN again.
snoRNA_pseudogene['target_count'] = snoRNA_pseudogene['target_count'].astype('Int64').astype(str)
snoRNA_pseudogene['target_count'] = snoRNA_pseudogene['target_count'].replace("<NA>", np.nan)
# Final column names and order for the edge file.
snoRNA_pseudogene.rename(columns={'ncbi_id':'snoRNA', 'target_count':'Target count', 'pseudogene_targets':'Pseudogene',
                              'is_expressed':'Expressed (True/False)'}, inplace=True)
snoRNA_pseudogene = snoRNA_pseudogene[['snoRNA', 'Pseudogene', 'Target count', 'Expressed (True/False)', 'Source(s)']]

snoRNA_pseudogene

In [None]:
# Write the de-duplicated snoRNA-pseudogene edges as a tab-separated file without the index.
(
    snoRNA_pseudogene
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-pseudogene.txt', sep='\t', index=None)
)

***
### snoRNA-(miscellaneous of) other targets - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoDB rows that list targets in the 'other' category.
snoRNA_other = (
    snoDB[['ncbi_id', 'other_targets', 'target_count', 'is_expressed']]
    .dropna(subset=['other_targets'])
)
snoRNA_other['ncbi_id'] = snoRNA_other['ncbi_id'].astype('Int64').astype(str) + '?snoRNA'

# One row per target: split the ';'-separated list and explode it.
snoRNA_other = (
    snoRNA_other
    .assign(other_targets=snoRNA_other['other_targets'].str.split(';'))
    .explode('other_targets')
)

snoRNA_other

In [None]:
# Manually curated target identifiers and their types, assigned by position.
# NOTE(review): both lists require exactly nine exploded rows in this order; a different
# snoDB release will raise a length mismatch or mislabel rows — confirm against the current dump.
snoRNA_other['other_targets'] = ['tRNA-SeC-TCA-1-1','3653?lncRNA','tRNA-SeC-TCA-1-1','3653?lncRNA',
                                 '3653?lncRNA','3653?lncRNA','3653?lncRNA','106633801?scaRNA','106633801?scaRNA']

# Type label of each row; the following cells split the rows by this column.
snoRNA_other['type'] = ['tRNA','lncRNA','tRNA','lncRNA','lncRNA','lncRNA','lncRNA','scaRNA','scaRNA']
snoRNA_other['Source(s)'] = 'snoDB' 
snoRNA_other

In [None]:
# 'other' targets typed as tRNA, reshaped to the snoRNA-tRNA layout.
snoRNA_tRNA2 = snoRNA_other.loc[snoRNA_other['type'] == 'tRNA'].drop(columns=['type'])
# Render counts as integer strings; missing counts become NaN again.
snoRNA_tRNA2['target_count'] = (
    snoRNA_tRNA2['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_tRNA2 = snoRNA_tRNA2.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count', 'other_targets': 'tRNA',
                                            'is_expressed': 'Expressed (True/False)'})
snoRNA_tRNA2 = snoRNA_tRNA2[['snoRNA', 'tRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]

# Write the combined, de-duplicated snoRNA-tRNA edges to 'RsnoRNA-tRNA.txt'.
(
    pd.concat([snoRNA_tRNA, snoRNA_tRNA2])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-tRNA.txt', sep='\t', index=None)
)

In [None]:
# 'other' targets typed as scaRNA, reshaped to the snoRNA-scaRNA layout.
snoRNA_scaRNA2 = snoRNA_other.loc[snoRNA_other['type'] == 'scaRNA'].drop(columns=['type'])
# Render counts as integer strings; missing counts become NaN again.
snoRNA_scaRNA2['target_count'] = (
    snoRNA_scaRNA2['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_scaRNA2 = snoRNA_scaRNA2.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count',
                                                'other_targets': 'scaRNA',
                                                'is_expressed': 'Expressed (True/False)'})
snoRNA_scaRNA2 = snoRNA_scaRNA2[['snoRNA', 'scaRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]

# Write the combined, de-duplicated snoRNA-scaRNA edges.
(
    pd.concat([snoRNA_scaRNA, snoRNA_scaRNA2])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-scaRNA.txt', sep='\t', index=None)
)

In [None]:
# 'other' targets typed as lncRNA, reshaped to the snoRNA-lncRNA layout.
snoRNA_lncRNA2 = snoRNA_other.loc[snoRNA_other['type'] == 'lncRNA'].drop(columns=['type'])
# Render counts as integer strings; missing counts become NaN again.
snoRNA_lncRNA2['target_count'] = (
    snoRNA_lncRNA2['target_count'].astype('Int64').astype(str).replace("<NA>", np.nan)
)
snoRNA_lncRNA2 = snoRNA_lncRNA2.rename(columns={'ncbi_id': 'snoRNA', 'target_count': 'Target count',
                                                'other_targets': 'lncRNA',
                                                'is_expressed': 'Expressed (True/False)'})
snoRNA_lncRNA2 = snoRNA_lncRNA2[['snoRNA', 'lncRNA', 'Target count', 'Expressed (True/False)', 'Source(s)']]

# Write the combined, de-duplicated snoRNA-lncRNA edges.
(
    pd.concat([snoRNA_lncRNA, snoRNA_lncRNA2])
    .drop_duplicates()
    .to_csv(edge_data_location + 'RsnoRNA-lncRNA.txt', sep='\t', index=None)
)

***
### lncRNA-gene - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Records with a target type that mentions 'PCG', one row per ';'-separated target.
lncRNA_gene = (
    LncRNAWiki
    .dropna(subset=['target_type'])
    .loc[lambda frame: frame['target_type'].str.contains('PCG')]
    .drop(columns=['synonyms', 'gene_locus', 'target_type', 'gene_id', 'target_effect',
                   'genome_variation', 'variation_detail', 'molecular_function'])
    .assign(target=lambda frame: frame['target'].str.split(';'))
    .explode('target')
)
# First lookup resolves the target symbol, second the lncRNA symbol. Both add a '0_y'
# column, so after the second merge they are named '0_y_x' (target) and '0_y_y' (lncRNA).
lncRNA_gene = lncRNA_gene.merge(
    symbol_entrez_map.rename(columns={'0_x': 'target'}), on='target').drop(columns=['target'])
lncRNA_gene = lncRNA_gene.merge(
    symbol_entrez_map.rename(columns={'0_x': 'symbol'}), on='symbol').drop(columns=['symbol'])
lncRNA_gene['0_y_y'] = lncRNA_gene['0_y_y'].astype(str) + '?lncRNA'
lncRNA_gene['Source(s)'] = 'LncRNAWiki'
lncRNA_gene

Manual fixes for some inconsistencies

In [None]:
# Move the two identifier columns to the front: '0_y_y' first, then '0_y_x'.
leading_columns = ['0_y_y', '0_y_x']
lncRNA_gene = lncRNA_gene[
    leading_columns + [column for column in lncRNA_gene.columns if column not in leading_columns]
]

In [None]:
lncRNA_gene['pmid'] = 'https://pubmed.ncbi.nlm.nih.gov/' + lncRNA_gene['pmid'].astype('Int64', errors='ignore').astype('str')
lncRNA_gene['pmid'] = lncRNA_gene['pmid'].replace('https://pubmed.ncbi.nlm.nih.gov/nan', np.nan)

# transcript
lncRNA_gene.transcript_id = lncRNA_gene.transcript_id.str.replace(',', '|')

# conservation ortholog
lncRNA_gene.conservation_ortholog = lncRNA_gene.conservation_ortholog.str.replace('Human;7SL', 'https://www.ncbi.nlm.nih.gov/gene/6029')
lncRNA_gene.conservation_ortholog = lncRNA_gene.conservation_ortholog.str.replace('Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)')

# biological context
lncRNA_gene.biological_context = lncRNA_gene.biological_context.str.replace('Disease', 'http://purl.obolibrary.org/obo/MONDO_0000001 (disease)')

# expression
lncRNA_gene['expression'] = lncRNA_gene['expression'].str.replace(
    'RNA', 'http://purl.obolibrary.org/obo/SO_0000356 (rna)')

# regulator
lncRNA_gene['regulator'] = lncRNA_gene['regulator'].str.replace(
    'HBX;TP53','PR_000008466 (hbx)|PR_Q12888 (tp53)')
lncRNA_gene = pd.merge(lncRNA_gene, symbol_to_pro[['0_x','1_y']].rename(columns={'0_x':'regulator'}), on='regulator', how='left')
lncRNA_gene['1_y'] = "http://purl.obolibrary.org/obo/" + lncRNA_gene['regulator'].astype(str)
lncRNA_gene['1_y'] = lncRNA_gene['1_y'].replace('http://purl.obolibrary.org/obo/nan', np.nan)
lncRNA_gene.drop(columns=['regulator'],inplace=True)
lncRNA_gene

# regulator type
lncRNA_gene['regulator_type'] = lncRNA_gene['regulator_type'].str.replace(';', '|')
lncRNA_gene['regulator_type'] = lncRNA_gene['regulator_type'].str.replace('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)')
lncRNA_gene['regulator_type'] = lncRNA_gene['regulator_type'].str.replace('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')

# regulator interaction
lncRNA_gene['regulator_interaction'] = lncRNA_gene['regulator_interaction'].replace('NA;NA', np.nan)
lncRNA_gene['regulator_interaction'] = lncRNA_gene['regulator_interaction'].str.replace('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)')
lncRNA_gene['regulator_interaction'] = lncRNA_gene['regulator_interaction'].str.replace('Protein-RNA', 'http://purl.obolibrary.org/obo/NCIT_C19019 (rna-protein interaction)')

# functional mechanism
lncRNA_gene['functional_mechanism'] = lncRNA_gene['functional_mechanism'].str.replace(';', '|')
lncRNA_gene['functional_mechanism'] = lncRNA_gene['functional_mechanism'].str.replace('Transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C19077 (transcriptional regulation)')
lncRNA_gene['functional_mechanism'] = lncRNA_gene['functional_mechanism'].str.replace('Epigenetic regulation', 'http://purl.obolibrary.org/obo/GO_0040029 (epigenetic regulation of gene expression)')

# epigenetic modification
lncRNA_gene['epigenetic_modification'] = lncRNA_gene['epigenetic_modification'].str.replace('DNA methylation','http://purl.obolibrary.org/obo/GO_0006306 (dna methylation)')
lncRNA_gene['epigenetic_modification'] = lncRNA_gene['epigenetic_modification'].str.replace('Histone modification','http://purl.obolibrary.org/obo/GO_0016570 (histone modification)')
lncRNA_gene['epigenetic_modification'] = lncRNA_gene['epigenetic_modification'].str.replace('Epigenetic change','http://purl.obolibrary.org/obo/NCIT_C21051 (epigenetic process)')

# regulator effect
lncRNA_gene['regulator_effect'] = lncRNA_gene['regulator_effect'].str.replace('promote;promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')
lncRNA_gene['regulator_effect'] = lncRNA_gene['regulator_effect'].str.replace('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')
lncRNA_gene['regulator_effect'] = lncRNA_gene['regulator_effect'].str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)')

# modification detail
# All terms are replaced in ONE pass. Replacing them one after another corrupts the
# values: 'methylation' is a substring of the other two terms and of every replacement
# label, so it rewrote 'hypomethylation' / 'promoter hypermethylation' before their own
# rules could match. The word boundaries keep 'methylation' from matching inside longer
# words, which are left untouched.
modification_terms = {
    'promoter hypermethylation': 'http://purl.obolibrary.org/obo/NCIT_C20102 (hypermethylation)',
    'hypomethylation': 'http://purl.obolibrary.org/obo/NCIT_C121521 (hypomethylation)',
    'methylation': 'http://purl.obolibrary.org/obo/NCIT_C16848 (methylation)',
}
modification_pattern = r'\b(?:' + '|'.join(re.escape(term) for term in modification_terms) + r')\b'
lncRNA_gene['modification_detail'] = lncRNA_gene['modification_detail'].str.replace(
    modification_pattern,
    lambda match: modification_terms[match.group(0)],
    regex=True,
)

# Per-column ordered literal substitutions (free text -> ontology URI).
_column_rules = {
    # expression detail
    'expression_detail': [
        ('Down-regulated', 'http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)'),
        ('Up-regulated', 'http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)'),
        ('Differentially expressed', 'http://purl.obolibrary.org/obo/OBI_0002584 (differential expression analysis data)'),
    ],
    # target interaction
    'target_interaction': [
        (';', '|'),
        ('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)'),
        ('Protein-RNA', 'http://purl.obolibrary.org/obo/NCIT_C19019 (rna-protein interaction)'),
        ('RNA-Protein', 'http://purl.obolibrary.org/obo/NCIT_C19019 (rna-protein interaction)'),
        ('RNA-RNA', 'http://purl.obolibrary.org/obo/FBcv_0003103 (rna-rna interaction)'),
    ],
    # clinical detail
    'clinical_detail': [
        (';', '|'),
        ('recurrence', 'http://purl.obolibrary.org/obo/NCIT_C3352 (recurrence)'),
        ('prognosis', 'http://purl.obolibrary.org/obo/OGMS_0000093 (prognosis)'),
        ('survival', 'http://purl.obolibrary.org/obo/NCIT_C17177 (survival)'),
        ('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)'),
        ('drug', 'http://purl.obolibrary.org/obo/CHEBI_23888 (drug)'),
    ],
}

for _column, _pairs in _column_rules.items():
    for _old, _new in _pairs:
        lncRNA_gene[_column] = lncRNA_gene[_column].str.replace(_old, _new)

# disease
lncRNA_gene['context_detail'] = list(map(replace_with_mondo, lncRNA_gene['context_detail']))

# tissue/cell line: normalise case and delimiter, then run the lookups in order
lncRNA_gene['tissue/cell line'] = lncRNA_gene['tissue/cell line'].str.lower().str.replace(';', '|')
for _lookup in (replace_with_clo, replace_with_clo2, replace_with_uberon):
    lncRNA_gene['tissue/cell line'] = [_lookup(item) for item in lncRNA_gene['tissue/cell line']]

# experimental method: fix known typos, then normalise and look up
_methods = lncRNA_gene['experimental_method']
_methods = _methods.str.replace('wetern', 'western')
_methods = _methods.str.replace('assay assay', 'assay')
_methods = _methods.str.lower()
_methods = _methods.str.replace(';', '|')
lncRNA_gene['experimental_method'] = _methods
lncRNA_gene['experimental_method'] = [replace_with_ncit(item) for item in lncRNA_gene['experimental_method']]

# biological process
lncRNA_gene['biological_process'] = lncRNA_gene['biological_process'].str.lower().str.replace(';', '|')
lncRNA_gene['biological_process'] = [replace_with_go(item) for item in lncRNA_gene['biological_process']]

# pathway
# Every pattern below is plain text, so regex=False is passed explicitly.
# Read as a regex, '??' is a lazy quantifier: 'nf-??b' would match 'nfb'/'nf-b'
# instead of the literal text 'nf-??b'.
# NOTE(review): the '??' presumably stands for mis-encoded Greek letters in the
# source file — confirm against the raw data.
_pathway = lncRNA_gene['pathway'].str.lower()
_pathway = _pathway.str.replace(';', '|', regex=False)
_pathway = _pathway.str.replace('nf-??b', 'nuclear factor kappa B', regex=False)
_pathway = _pathway.str.replace('/??-catenin', '', regex=False)
_pathway = _pathway.str.replace('pi3k/akt ', '', regex=False)
lncRNA_gene['pathway'] = _pathway
lncRNA_gene['pathway'] = [replace_with_pw(item) for item in lncRNA_gene['pathway']]

# drug
lncRNA_gene['drug'] = lncRNA_gene['drug'].str.lower().str.replace(';', '|', regex=False)
lncRNA_gene['drug'] = [replace_with_drugbank(item) for item in lncRNA_gene['drug']]

In [None]:
# Human-readable headers: underscores become spaces, first letter upper-cased.
_spaced_names = lncRNA_gene.columns.str.replace('_', ' ')
lncRNA_gene.columns = _spaced_names.str.capitalize()

# Give the merge-generated and abbreviated headers their final names.
_final_names = {
    '0 y y': 'lncRNA',
    '0 y x': 'Gene',
    'Transcript id': 'Transcript ID',
    'Pmid': 'References (PMID)',
    '1 y': 'Regulator',
}
lncRNA_gene = lncRNA_gene.rename(columns=_final_names)

# Move 'Source(s)' to the last position: the index is computed before the
# column is popped, so it equals the column count after the pop.
_last_position = len(lncRNA_gene.columns) - 1
_sources = lncRNA_gene.pop('Source(s)')
lncRNA_gene.insert(_last_position, 'Source(s)', _sources)



In [None]:
# Collapse rows sharing the same (lncRNA, Gene) pair and write the edge list as TSV.
_lncRNA_gene_edges = merge_rows(lncRNA_gene, 'lncRNA', 'Gene')
_lncRNA_gene_edges.to_csv(edge_data_location + 'RlncRNA-gene.txt', sep='\t', index=None)

***
### lncRNA-disease - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition)

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/) <br /> LncRNADisease integrates comprehensive experimentally supported and predicted ncRNA-disease associations, manually curated from the literature and other resources.

In [None]:
# Fetch the experimentally supported LncRNADisease associations (Excel file).
_lncrnadisease_url = 'http://www.rnanut.net/lncrnadisease/static/download/experimental%20lncRNA-disease%20information.xlsx'
data_downloader(_lncrnadisease_url, unprocessed_data_location)

In [None]:
lncRNA_disease = pd.read_excel(unprocessed_data_location + 'experimental%20lncRNA-disease%20information.xlsx')
# Keep only Homo sapiens rows. na=False treats a missing species as "not human";
# without it a NaN in 'Species' makes the boolean mask invalid.
_is_human = lncRNA_disease['Species'].str.contains('sapiens', na=False)
# drop() returns a new frame, so the column assignment below does not write
# into a slice of the original table.
lncRNA_disease = lncRNA_disease[_is_human].drop(columns=['ncRNA Category', 'Species'])
lncRNA_disease['Source(s)'] = 'LncRNADisease'
lncRNA_disease

In [None]:
# Join on the lower-cased disease name to pick up the identifier stored in
# column 1 of desc_disPhe_map; the free-text name is dropped afterwards.
_named = lncRNA_disease.rename(columns={"Disease Name": "desc"})
_named['desc'] = _named['desc'].str.lower()
_disease_lookup = desc_disPhe_map.rename(columns={0: 'desc'})
lncRNA_disease = _disease_lookup.merge(_named, on=['desc']).drop(columns=['desc'])
lncRNA_disease

***
* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html) <br /> Lnc2Cancer is a manually curated database that provides comprehensive experimentally supported associations between lncRNA or circRNA and human cancer.

In [None]:
! wget http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/lncRNA.xlsx

In [None]:
# Lnc2Cancer: keep the rows whose cancer type has an entry in desc_disPhe_map.
_lnc2cancer = pd.read_excel(unprocessed_data_location + 'lncRNA.xlsx')
_disease_lookup = desc_disPhe_map.rename(columns={0: 'desc'})
lncRNA_disease2 = _disease_lookup.merge(_lnc2cancer, left_on=['desc'], right_on=['cancer type'])
lncRNA_disease2 = lncRNA_disease2.drop(columns=['desc', 'cancer type'])
lncRNA_disease2 = lncRNA_disease2.rename(columns={'name': 'ncRNA Symbol'})
lncRNA_disease2['Source(s)'] = 'Lnc2Cancer'
lncRNA_disease2

In [None]:
def _append_if_different(frame, target, other, sep='|'):
    """Append ``frame[other]`` to ``frame[target]`` where both are present and differ.

    ``frame`` is modified in place. Rows where either value is missing, or
    where the two values are equal, are left untouched.
    """
    differs = frame[target].notna() & frame[other].notna() & (frame[target] != frame[other])
    frame.loc[differs, target] = frame[target] + sep + frame[other]


# Outer-join LncRNADisease with Lnc2Cancer on column 1 and the lncRNA symbol.
lncRNA_disease = pd.merge(lncRNA_disease, lncRNA_disease2, how='outer', on=[1,'ncRNA Symbol'])

# NOTE: fillna results are assigned back instead of using
# `df[col].fillna(..., inplace=True)`. That form is chained assignment, which
# does not update the frame when pandas Copy-on-Write is active.

# Sample: take the Lnc2Cancer value when LncRNADisease has none, otherwise
# keep both (lower-cased) when they disagree.
lncRNA_disease['Sample'] = lncRNA_disease['Sample'].fillna(lncRNA_disease['sample'])
lncRNA_disease['Sample'] = lncRNA_disease['Sample'].str.lower()
lncRNA_disease['sample'] = lncRNA_disease['sample'].str.lower()
_append_if_different(lncRNA_disease, 'Sample', 'sample')

# Dysfunction pattern
lncRNA_disease['Dysfunction Pattern'] = lncRNA_disease['Dysfunction Pattern'].fillna(lncRNA_disease['regulated'])
_append_if_different(lncRNA_disease, 'Dysfunction Pattern', 'regulated')

# PubMed ids: both columns are cast to str, so a missing id becomes the text
# 'nan' and there is nothing left for fillna to fill. The resulting 'nan|' and
# '|nan' fragments are stripped later, in the "Manual fix of inconsistencies" cell.
lncRNA_disease['PubMed ID'] = lncRNA_disease['PubMed ID'].astype(str)
lncRNA_disease['pubmed id'] = lncRNA_disease['pubmed id'].astype(str)
_append_if_different(lncRNA_disease, 'PubMed ID', 'pubmed id')

# Validated method (joined with ' + ', not '|')
lncRNA_disease['Validated Method'] = lncRNA_disease['Validated Method'].fillna(lncRNA_disease['methods'])
_append_if_different(lncRNA_disease, 'Validated Method', 'methods', sep=' + ')

# Provenance: concatenate the two source labels ('nan' marks a missing side).
lncRNA_disease['Source(s)_x'] = lncRNA_disease['Source(s)_x'].astype(str)
lncRNA_disease['Source(s)_y'] = lncRNA_disease['Source(s)_y'].astype(str)
lncRNA_disease['Source(s)'] = lncRNA_disease['Source(s)_x'] + '|' + lncRNA_disease['Source(s)_y']
lncRNA_disease = lncRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

# Drop the Lnc2Cancer-only columns that have been folded into the columns above.
lncRNA_disease.drop(columns=['sample', 'regulated','methods', 'year', 'title', 'pubmed id'],inplace=True)

lncRNA_disease

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
LncRNAWiki = pd.read_csv(unprocessed_data_location + 'LncRNAWiki_BrowseDownload.csv')
# Keep the entries whose biological context is present and mentions a disease
# ('isease' matches both "Disease" and "disease").
_with_context = LncRNAWiki.loc[LncRNAWiki['biological_context'].notna()]
_disease_context = _with_context.loc[_with_context['biological_context'].str.contains('isease')]
lncRNA_disease2 = (_disease_context
                   .drop(columns=['synonyms', 'gene_locus'])
                   .rename(columns={'context_detail': 'desc', 'symbol': 'ncRNA Symbol'}))
lncRNA_disease2['Source(s)'] = 'LncRNAWiki'
lncRNA_disease2

In [None]:
# Resolve the LncRNAWiki disease names, then outer-join with the running table
# on column 1 and the lncRNA symbol.
lncRNA_disease2 = pd.merge(lncRNA_disease2, desc_disPhe_map.rename(columns={0:'desc'}), on='desc')
lncRNA_disease2.drop(columns='desc',inplace=True)
lncRNA_disease = pd.merge(lncRNA_disease, lncRNA_disease2, how='outer', on=[1,'ncRNA Symbol'])

# Sample: fall back to the LncRNAWiki tissue/cell line, or keep both when they
# differ. The fillna result is assigned back: `df[col].fillna(..., inplace=True)`
# is chained assignment and does not update the frame under Copy-on-Write.
lncRNA_disease['Sample'] = lncRNA_disease['Sample'].fillna(lncRNA_disease['tissue/cell line'])
_sample_differs = (lncRNA_disease['Sample'].notna() & lncRNA_disease['tissue/cell line'].notna() &
                   (lncRNA_disease['Sample'] != lncRNA_disease['tissue/cell line']))
lncRNA_disease.loc[_sample_differs, 'Sample'] = lncRNA_disease['Sample'] + '|' + lncRNA_disease['tissue/cell line']

lncRNA_disease.drop(columns=['tissue/cell line'],inplace=True)

# PubMed ids: after the str cast a missing id is the text 'nan', so there is
# nothing to fill. 'nan|' fragments are stripped later, in the "Manual fix of
# inconsistencies" cell.
lncRNA_disease['PubMed ID'] = lncRNA_disease['PubMed ID'].astype(str)
# 'pmid' is not cast anywhere else; cast a copy so the string concatenation
# below stays type-safe even if the column was parsed as numbers.
_pmid_text = lncRNA_disease['pmid'].astype(str)
_pmid_differs = lncRNA_disease['pmid'].notna() & (lncRNA_disease['PubMed ID'] != _pmid_text)
lncRNA_disease.loc[_pmid_differs, 'PubMed ID'] = lncRNA_disease['PubMed ID'] + '|' + _pmid_text

# Provenance: concatenate the two source labels ('nan' marks a missing side).
lncRNA_disease['Source(s)_x'] = lncRNA_disease['Source(s)_x'].astype(str)
lncRNA_disease['Source(s)_y'] = lncRNA_disease['Source(s)_y'].astype(str)
lncRNA_disease['Source(s)'] = lncRNA_disease['Source(s)_x'] + '|' + lncRNA_disease['Source(s)_y']
lncRNA_disease = lncRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

lncRNA_disease.drop(columns=['pmid'],inplace=True)

lncRNA_disease

***
* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
! wget https://ngdc.cncb.ac.cn/lncbook/files/variation_LncBook2.0.csv.gz

In [None]:
lncRNA_disease2 = pd.read_csv(unprocessed_data_location + 'variation_LncBook2.0.csv.gz')

# '-' is the file's placeholder for "no value": keep only rows that have both a
# tumor name and a gene symbol. .copy() detaches the result so the assignments
# below do not write into a slice of the full table.
_has_tumor = lncRNA_disease2['COSMIC Tumor Name'] != '-'
_has_symbol = lncRNA_disease2['Symbol'] != '-'
lncRNA_disease2 = lncRNA_disease2[_has_tumor & _has_symbol].copy()

# One row per tumor name (names are ';'-separated within a cell).
lncRNA_disease2['COSMIC Tumor Name'] = lncRNA_disease2['COSMIC Tumor Name'].str.split(';')
lncRNA_disease2 = lncRNA_disease2.explode('COSMIC Tumor Name')

# Drop parenthesised annotations, surrounding whitespace and case differences.
# The vectorised .str methods pass NaN through, whereas a per-item
# `desc.startswith(' ')` raises on a missing tumor name.
lncRNA_disease2['COSMIC Tumor Name'] = (lncRNA_disease2['COSMIC Tumor Name']
                                        .str.replace(r"\(.*?\)", "", regex=True)
                                        .str.strip()
                                        .str.lower())
lncRNA_disease2.rename(columns={'COSMIC Tumor Name':'desc','Symbol':'ncRNA Symbol'},inplace=True)

# Turn the '-' placeholder into a real missing value in the ClinVar columns.
for _column in ('ClinVar Allele ID', 'ClinVar Variation Effect', 'ClinVar Disease Name'):
    lncRNA_disease2[_column] = lncRNA_disease2[_column].replace('-', np.nan)

lncRNA_disease2['Source(s)'] = 'LncBook'
lncRNA_disease2

In [None]:
# Inspect the distinct tumor names that still need a manual ontology mapping.
pd.unique(lncRNA_disease2['desc'])

In [None]:
# Manual mapping from the normalised COSMIC tumor names to MONDO term ids.
# Ids use the underscore form ('MONDO_0006130') throughout; 'ns' maps to a
# missing value.
term_mapping = {
    'liver carcinoma': 'MONDO_0007256',
    'oesophagus carcinoma': 'MONDO_0019086',
    'breast carcinoma': 'MONDO_0004989',
    'lung carcinoma': 'MONDO_0005138',
    'haematopoietic and lymphoid tissue carcinoma': 'MONDO_0017348',
    'prostate carcinoma': 'MONDO_0005159',
    'large intestine carcinoma': 'MONDO_0024331',
    'skin carcinoma': 'MONDO_0002656',
    'pancreas carcinoma': 'MONDO_0006047',
    # was 'MONDO:0006130' — the colon form matches no other id in this table
    'central nervous system carcinoma': 'MONDO_0006130',
    'biliary tract carcinoma': 'MONDO_0003707',
    'endometrium carcinoma': 'MONDO_0005461',
    'ovary carcinoma': 'MONDO_0005140',
    'kidney carcinoma': 'MONDO_0005206',
    'urinary tract carcinoma': 'MONDO_0040679',
    'cervix carcinoma': 'MONDO_0005131',
    'soft tissue carcinoma': 'MONDO_0006424',
    'stomach carcinoma': 'MONDO_0004950',
    'bone carcinoma': 'MONDO_0002415',
    'small intestine carcinoma': 'MONDO_0005522',
    'thyroid carcinoma': 'MONDO_0015075',
    'upper aerodigestive tract carcinoma': 'MONDO_0005398',
    'placenta carcinoma': 'MONDO_0002178',
    'salivary gland carcinoma': 'MONDO_0000521',
    'adrenal gland carcinoma': 'MONDO_0002814',
    'autonomic ganglia carcinoma': 'MONDO_0003996',
    'meninges carcinoma': 'MONDO_0021322',
    'eye carcinoma': 'MONDO_0002466',
    # NOTE(review): same id as 'ovary carcinoma' — confirm this is intended
    'genital tract carcinoma': 'MONDO_0005140',
    'pleura carcinoma': 'MONDO_0006294',
    'parathyroid carcinoma': 'MONDO_0012004',
    'thymus carcinoma': 'MONDO_0006451',
    'pituitary carcinoma': 'MONDO_0017582',
    'testis carcinoma': 'MONDO_0005447',
    'peritoneum carcinoma': 'MONDO_0002113',
    'uterine adnexa carcinoma': 'MONDO_0001351',
    'gastrointestinal tract carcinoma': 'MONDO_0006181',
    'fallopian tube carcinoma': 'MONDO_0006206',
    'penis carcinoma': 'MONDO_0006360',
    'vulva carcinoma': 'MONDO_0005215',
    'ns': np.nan,
}

# Translate tumor names to ids; names absent from term_mapping become NaN.
lncRNA_disease2 = lncRNA_disease2.assign(desc=lncRNA_disease2['desc'].map(term_mapping))

In [None]:
# Column 1 holds the disease id; name it 'desc' so it lines up with LncBook.
lncRNA_disease = lncRNA_disease.rename(columns={1: 'desc'})

# Add the LncBook rows, then keep only the symbols found in symbol_entrez_map
# (inner join on the symbol).
_symbol_lookup = symbol_entrez_map.rename(columns={'0_x': 'ncRNA Symbol'})
lncRNA_disease = lncRNA_disease.merge(lncRNA_disease2, how='outer', on=['desc', 'ncRNA Symbol'])
lncRNA_disease = lncRNA_disease.merge(_symbol_lookup, on='ncRNA Symbol')

# The mapped identifier ('0_y') becomes the first column, tagged as an lncRNA node.
_node_ids = lncRNA_disease.pop('0_y').astype(str) + '?lncRNA'
lncRNA_disease.insert(0, '0_y', _node_ids)

# Provenance: concatenate the two source labels ('nan' marks a missing side).
_left_source = lncRNA_disease['Source(s)_x'].astype(str)
_right_source = lncRNA_disease['Source(s)_y'].astype(str)
lncRNA_disease['Source(s)'] = _left_source + '|' + _right_source

_obsolete = ['Source(s)_x', 'Source(s)_y', 'ncRNA Symbol', 'gene_id',
             'Gene ID', 'molecular_function', 'target_interaction']
lncRNA_disease = lncRNA_disease.drop(columns=_obsolete)

lncRNA_disease

In [None]:
# Checkpoint of the fully merged table, so the clean-up cells below can be
# re-run from a known state (restore with
# `lncRNA_disease = lncRNA_disease_checkpoint.copy()`).
# The previous version restored from `lncRNA_disease2`, which at this point
# still holds the LncBook-only table. Running the notebook top to bottom
# therefore threw away the merge result and left the next cell without the
# columns it expects (e.g. 'PubMed ID').
lncRNA_disease_checkpoint = lncRNA_disease.copy()

Manual fix of inconsistencies.

In [None]:
_pubmed_url = 'https://pubmed.ncbi.nlm.nih.gov/'

# PubMed ids arrive as '|'-joined text that may contain 'nan' placeholders and
# float artefacts such as '12345.0'.
_pubmed = lncRNA_disease['PubMed ID'].str.replace(r'nan\||\|nan', '', regex=True)
# Strip a trailing '.0' only at the end of an id. The unescaped regex '.0'
# matches ANY character followed by '0' and deleted digits inside ids
# (e.g. '20301234' -> '1234').
_pubmed = _pubmed.str.replace(r'\.0(?=\||$)', '', regex=True)
# Prefix every id with the PubMed URL: first those after a '|', then the first one.
_pubmed = _pubmed.str.replace('|', '|' + _pubmed_url, regex=False)
_pubmed = _pubmed_url + _pubmed.astype(str)
# A missing id has become '<url>nan' at this point; turn it back into NaN.
lncRNA_disease['PubMed ID'] = _pubmed.replace(_pubmed_url + 'nan', np.nan)

# NOTE(review): this cell also cleans a column named 'dbSNP ID' further down;
# confirm that 'Dbsnp id' really is a separate column of the merged table.
lncRNA_disease['Dbsnp id'] = 'https://www.ncbi.nlm.nih.gov/snp/' + lncRNA_disease['Dbsnp id'].astype('str')
lncRNA_disease['Dbsnp id'] = lncRNA_disease['Dbsnp id'].replace('https://www.ncbi.nlm.nih.gov/snp/nan', np.nan)

# Remove the 'nan' placeholders left by the string concatenation of source labels.
lncRNA_disease['Source(s)'] = lncRNA_disease['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

# Transcript ids: switch the list delimiter to '|'.
_transcripts = lncRNA_disease['transcript_id']
lncRNA_disease['transcript_id'] = _transcripts.str.replace(', ', '|')

# Fold the free-text function description into 'Description' where both exist.
_both_descriptions = (lncRNA_disease["Description"].notna()
                      & lncRNA_disease["function description"].notna())
_joined_descriptions = lncRNA_disease['Description'] + '|' + lncRNA_disease['function description']
lncRNA_disease.loc[_both_descriptions, ["Description"]] = _joined_descriptions
lncRNA_disease = lncRNA_disease.drop(columns=['function description'])

# Orthologs and biological context: literal value -> identifier.
for _column, _old, _new in (
    ('conservation_ortholog', 'Human;7SL', 'https://www.ncbi.nlm.nih.gov/gene/6029'),
    ('conservation_ortholog', 'Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)'),
    ('biological_context', 'Disease', 'http://purl.obolibrary.org/obo/MONDO_0000001 (disease)'),
    ('epigenetic_modification', 'DNA methylation', 'http://purl.obolibrary.org/obo/GO_0006306 (dna methylation)'),
    ('epigenetic_modification', 'Histone modification', 'http://purl.obolibrary.org/obo/GO_0016570 (histone modification)'),
    ('epigenetic_modification', 'Epigenetic change', 'http://purl.obolibrary.org/obo/NCIT_C21051 (epigenetic process)'),
):
    lncRNA_disease[_column] = lncRNA_disease[_column].str.replace(_old, _new)

# Variation detail: unwrap 'SNP(...)' and switch the delimiter to '|'.
# regex=False is explicit because 'SNP(' is not a valid regular expression
# (unbalanced parenthesis) and must be matched as plain text.
_variation = lncRNA_disease['variation_detail'].str.replace('SNP(', '', regex=False)
_variation = _variation.str.replace(')', '', regex=False)
lncRNA_disease['variation_detail'] = _variation.str.replace(';', '|', regex=False)

lncRNA_disease['modification_detail'] = lncRNA_disease['modification_detail'].str.replace(';', '|', regex=False)

lncRNA_disease['expression'] = lncRNA_disease['expression'].str.replace(
    'RNA', 'http://purl.obolibrary.org/obo/SO_0000356 (RNA)', regex=False)

# 'NA;NA' as the whole cell value means "no interaction recorded".
lncRNA_disease['regulator_interaction'] = lncRNA_disease['regulator_interaction'].replace('NA;NA', np.nan)

# Per-column ordered literal substitutions (free text -> ontology URI). Within
# a column the rules run top to bottom, each on the output of the previous one.
_disease_rules = {
    'regulator_type': [
        (';', '|'),
        ('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)'),
        ('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)'),
        ('PCG', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)'),
    ],
    'regulator_interaction': [
        ('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)'),
        ('Protein-RNA', 'http://purl.obolibrary.org/obo/NCIT_C19019 (rna-protein interaction)'),
    ],
    'regulator_effect': [
        (';', '|'),
        ('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)'),
        ('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)'),
    ],
    'target_effect': [
        (';', '|'),
        ('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)'),
        ('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)'),
    ],
    'target_type': [
        (';', '|'),
        ('miRNA', 'http://purl.obolibrary.org/obo/SO_0000276 (mirna)'),
        ('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)'),
        ('PCG', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)'),
        ('lncRNA', 'http://purl.obolibrary.org/obo/SO_0001877 (lncrna)'),
    ],
    'functional_mechanism': [
        (';', '|'),
        ('Post-transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C18952 (post-transcriptional regulation)'),
        ('Transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C19077 (transcriptional regulation)'),
        ('Epigenetic regulation', 'http://purl.obolibrary.org/obo/GO_0040029 (epigenetic regulation of gene expression)'),
    ],
    'clinical_detail': [
        (';', '|'),
        ('recurrence', 'http://purl.obolibrary.org/obo/NCIT_C3352 (recurrence)'),
        ('prognosis', 'http://purl.obolibrary.org/obo/OGMS_0000093 (prognosis)'),
        ('survival', 'http://purl.obolibrary.org/obo/NCIT_C17177 (survival)'),
        ('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)'),
        ('drug', 'http://purl.obolibrary.org/obo/CHEBI_23888 (drug)'),
    ],
}

for _column, _pairs in _disease_rules.items():
    for _old, _new in _pairs:
        lncRNA_disease[_column] = lncRNA_disease[_column].str.replace(_old, _new)

# Biological process
lncRNA_disease['biological_process'] = lncRNA_disease['biological_process'].str.lower().str.replace(';', '|', regex=False)
lncRNA_disease['biological_process'] = [replace_with_go(item) for item in lncRNA_disease['biological_process']]

# Pathway
# Every pattern below is plain text, so regex=False is passed explicitly.
# Read as a regex, '??' is a lazy quantifier: 'nf-??b' would match 'nfb'/'nf-b'
# instead of the literal text 'nf-??b'.
# NOTE(review): the '??' presumably stands for mis-encoded Greek letters in the
# source file — confirm against the raw data.
_pathway = lncRNA_disease['pathway'].str.lower()
_pathway = _pathway.str.replace(';', '|', regex=False)
_pathway = _pathway.str.replace('nf-??b', 'nuclear factor kappa B', regex=False)
_pathway = _pathway.str.replace('/??-catenin', '', regex=False)
_pathway = _pathway.str.replace('pi3k/akt ', '', regex=False)
lncRNA_disease['pathway'] = _pathway
lncRNA_disease['pathway'] = [replace_with_pw(item) for item in lncRNA_disease['pathway']]

# Drug
lncRNA_disease['drug'] = lncRNA_disease['drug'].str.lower().str.replace(';', '|', regex=False)
lncRNA_disease['drug'] = [replace_with_drugbank(item) for item in lncRNA_disease['drug']]

# Regulator: '|'-delimited, lower-cased, then looked up with replace_with_pro.
_regulators = lncRNA_disease['regulator'].str.replace(';', '|')
lncRNA_disease['regulator'] = _regulators.str.lower()
lncRNA_disease['regulator'] = [replace_with_pro(item) for item in lncRNA_disease['regulator']]

# A cell that is exactly '-' means "no value".
_clinvar_disease = lncRNA_disease['ClinVar Disease Name']
lncRNA_disease['ClinVar Disease Name'] = _clinvar_disease.replace('-', np.nan)

_genome_variation = lncRNA_disease['genome_variation']
lncRNA_disease['genome_variation'] = _genome_variation.str.replace(
    'Mutation', 'http://purl.obolibrary.org/obo/OMIT_0010192 (mutation)')

_dbsnp = lncRNA_disease['dbSNP ID']
lncRNA_disease['dbSNP ID'] = _dbsnp.replace('-', np.nan)

_cosmic_effect = lncRNA_disease['COSMIC Variation Effect']
lncRNA_disease['COSMIC Variation Effect'] = _cosmic_effect.str.replace(
    'Pathogenic', 'http://purl.obolibrary.org/obo/NCIT_C168799 (pathogenic variant)')

- Dysfunction pattern

In [None]:
# Append the expression detail to the dysfunction pattern where both exist,
# then drop the now-redundant column.
_has_both_patterns = (lncRNA_disease["Dysfunction Pattern"].notna()
                      & lncRNA_disease["expression_detail"].notna())
_joined_patterns = lncRNA_disease['Dysfunction Pattern'] + '|' + lncRNA_disease['expression_detail']
lncRNA_disease.loc[_has_both_patterns, ["Dysfunction Pattern"]] = _joined_patterns
lncRNA_disease = lncRNA_disease.drop(columns=['expression_detail'])

# Lookup table: miRBaseMap column 2 -> miRBaseMap column 0.
mirna_dict = {key: value for key, value in zip(miRBaseMap[2], miRBaseMap[0])}

def replace_with_mirbase(substring):
    """Map each '|'-separated token of ``substring`` through ``mirna_dict``.

    Tokens without an entry in ``mirna_dict`` are kept unchanged. A missing
    value (anything ``pd.isna`` accepts) yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    tokens = substring.split('|')
    return '|'.join(mirna_dict.get(token, token) for token in tokens)

def extract_pattern(value):
    """Return a one-element Series holding the text that follows the first '['.

    When ``value`` is missing, or does not contain both '[' and ']', it is
    returned unchanged (still wrapped in a one-element Series). The extracted
    text runs up to the next '[' (or the end of the string), so the closing
    ']' is kept.
    """
    has_brackets = pd.notna(value) and '[' in value and ']' in value
    if not has_brackets:
        return pd.Series([value])
    _, after_bracket, *_ = value.split('[')
    return pd.Series([after_bracket])

# Dysfunction pattern: keep the bracketed detail, resolve miRNA names, then map
# the remaining free text to ontology URIs.
_dysfunction = lncRNA_disease['Dysfunction Pattern'].str.replace('/', '+', regex=False)
# extract_pattern returns a one-element Series per value; unwrap it right away.
_dysfunction = pd.Series([extract_pattern(value).iloc[0] for value in _dysfunction],
                         index=_dysfunction.index, dtype=object)
# The extracted text keeps the closing ']'. Strip it before the lookups so the
# last token of a bracketed list can match as well.
_dysfunction = _dysfunction.str.replace(']', '', regex=False)

# miRNA names: look up as written, then retry with the species prefix added.
# The lookbehind avoids turning an existing 'hsa-miR' into 'hsa-hsa-miR'.
_dysfunction = _dysfunction.map(replace_with_mirbase)
_dysfunction = _dysfunction.str.replace(r'(?<!hsa-)miR', 'hsa-miR', regex=True)
_dysfunction = _dysfunction.map(replace_with_mirbase)

# 'regulation' must be mapped BEFORE the labels below are inserted. Applied
# afterwards it also matched inside '(down-regulation)' / '(up-regulation)'
# and nested a second URI into those labels.
_dysfunction = _dysfunction.str.replace(
    'regulation', 'http://purl.obolibrary.org/obo/GO_0010468 (regulation of gene expression)', regex=False)

for _old, _new in (
    ('Down-regulated', 'http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)'),
    ('Up-regulated', 'http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)'),
    # label aligned with the one used for the same term in the lncRNA-gene
    # section (previously misspelled as 'differental expression')
    ('Differentially expressed', 'http://purl.obolibrary.org/obo/OBI_0002584 (differential expression analysis data)'),
    ('Mutation', 'http://purl.obolibrary.org/obo/OMIT_0010192 (mutation)'),
):
    _dysfunction = _dysfunction.str.replace(_old, _new, regex=False)

lncRNA_disease['Dysfunction Pattern'] = _dysfunction

- Validated method.

In [None]:
# Append the LncRNAWiki experimental method to 'Validated Method' where both
# exist, then drop the now-redundant column.
_has_both_methods = (lncRNA_disease["Validated Method"].notna()
                     & lncRNA_disease["experimental_method"].notna())
lncRNA_disease.loc[_has_both_methods, 'Validated Method'] = (
    lncRNA_disease['Validated Method'] + '|' + lncRNA_disease['experimental_method'])
lncRNA_disease = lncRNA_disease.drop(columns=['experimental_method'])

# Lower-case first so the typo fixes also catch capitalised spellings.
_methods = lncRNA_disease['Validated Method'].str.lower()
_methods = _methods.str.replace('wetern', 'western', regex=False)
_methods = _methods.str.replace('assay assay', 'assay', regex=False)
# Unify the list delimiters. ';' is included because the lncRNA-gene section
# treats it as the delimiter of 'experimental_method'; without it those lists
# stay one token.
# NOTE(review): assumes replace_with_ncit splits on '|' like the other
# replace_with_* helpers — confirm.
for _delimiter in ('//', ', ', ';'):
    _methods = _methods.str.replace(_delimiter, '|', regex=False)
lncRNA_disease['Validated Method'] = _methods
lncRNA_disease['Validated Method'] = [replace_with_ncit(item) for item in lncRNA_disease['Validated Method']]

- Sample.

In [None]:
# Second UBERON lookup, keyed on the description with the ' tissue' suffix
# removed. Values follow the '<OBO base><term id> (<label>)' layout used for
# every other mapping in this notebook. The previous expression left the term
# id out of the URI and put it where the label belongs.
# NOTE(review): assumes desc_uberon_map[0] is the description and
# desc_uberon_map[1] the term id — confirm against where the map is built.
uberon_dict2 = dict(zip(desc_uberon_map[0].str.replace(' tissue', '', regex=False),
                        'http://purl.obolibrary.org/obo/' + desc_uberon_map[1] +
                        ' (' + desc_uberon_map[0] + ')'))

def replace_with_uberon2(substring):
    if pd.isna(substring):
        return np.nan
    else:
        return '|'.join([uberon_dict2.get(part, part) for part in substring.split('|')])
    
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(';', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(',', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace('(', '')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(')', '')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(' as well as ', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(' and ', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(' |', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace('| ', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(' | ', '|')
lncRNA_disease.Sample = lncRNA_disease.Sample.str.replace(', ', '|')
lncRNA_disease.Sample = [replace_with_clo(item) for item in lncRNA_disease.Sample]
lncRNA_disease.Sample = [replace_with_clo2(item) for item in lncRNA_disease.Sample]
lncRNA_disease.Sample = [replace_with_uberon(item) for item in lncRNA_disease.Sample]
lncRNA_disease.Sample = [replace_with_uberon2(item) for item in lncRNA_disease.Sample]
lncRNA_disease.Sample.unique()[:5]

- Target.

In [None]:
# Prefix bare miRNA names with the human species code before the miRBase lookup.
# The look-behind skips names that already start with "hsa-", which the
# unconditional replace turned into "hsa-hsa-miR...".
lncRNA_disease['target'] = lncRNA_disease['target'].str.replace(r'(?<!hsa-)miR', 'hsa-miR', regex=True)
lncRNA_disease['target'] = [replace_with_mirbase(item) for item in lncRNA_disease['target']]
# NOTE(review): the miRBase lookup runs before ';' is converted to '|' and the
# result is lower-cased afterwards - confirm replace_with_mirbase copes with that.
lncRNA_disease['target'] = lncRNA_disease['target'].str.replace(';', '|', regex=False).str.lower()
lncRNA_disease['target'] = [replace_with_pro(item) for item in lncRNA_disease['target']]
lncRNA_disease['target'].unique()[:5]

- ClinVar variation effect.

In [None]:
# First pass: '|'-separated, lower-cased effects are mapped with replace_with_mondo.
clinvar_effects = lncRNA_disease['ClinVar Variation Effect'].str.replace('; ', '|').str.lower()
lncRNA_disease['ClinVar Variation Effect'] = list(map(replace_with_mondo, clinvar_effects))

# Second pass: drop a trailing "s" from each value and map again.
clinvar_singular = lncRNA_disease['ClinVar Variation Effect'].str.replace(r's$', '', regex=True)
lncRNA_disease['ClinVar Variation Effect'] = list(map(replace_with_mondo, clinvar_singular))
lncRNA_disease['ClinVar Variation Effect'].unique()[:5]

- GWAS trait.

In [None]:
# First pass: '|'-separated, lower-cased traits are mapped with replace_with_mondo.
gwas_traits = lncRNA_disease['GWAS Trait'].str.replace('; ', '|').str.lower()
lncRNA_disease['GWAS Trait'] = list(map(replace_with_mondo, gwas_traits))

# Second pass: drop a trailing "s", map with replace_with_mondo again, then
# pass the result through replace_with_ncit.
gwas_traits = lncRNA_disease['GWAS Trait'].str.replace(r's$', '', regex=True)
for trait_mapper in (replace_with_mondo, replace_with_ncit):
    gwas_traits = [trait_mapper(item) for item in gwas_traits]
lncRNA_disease['GWAS Trait'] = gwas_traits
lncRNA_disease['GWAS Trait'].unique()[:5]

In [None]:
# Header clean-up: underscores become spaces and only the first letter stays
# upper-case (str.capitalize lower-cases the rest of each name).
tidy_headers = lncRNA_disease.columns.str.replace('_', ' ').str.capitalize()
lncRNA_disease.columns = tidy_headers

# Final, human-readable names for the edge file.
final_headers = {
    '0 y': 'lncRNA',
    'Pubmed id': 'References (PMID)',
    'Transcript id': 'Transcript ID',
    '1 y': 'Regulator',
    'Desc': 'Disease',
    'Dbsnp id': 'SNP',
}
lncRNA_disease = lncRNA_disease.rename(columns=final_headers)

In [None]:
# Pass the frame through merge_rows with the (lncRNA, Disease) columns, then
# write the result as a tab-separated file without the index.
lncRNA_disease_edges = merge_rows(lncRNA_disease, "lncRNA", "Disease")
lncRNA_disease_edges.to_csv(edge_data_location + 'RlncRNA-disease.txt', sep='\t', index=None)

***
### circRNA-disease - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition)

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/)

In [None]:
! wget http://www.rnanut.net/lncrnadisease/static/download/experimental%20circRNA-disease%20information.xlsx

In [None]:
# Load the LncRNADisease circRNA sheet and keep human circRNA records only.
circRNA_disease = pd.read_excel(unprocessed_data_location + 'experimental circRNA-disease information.xlsx')
# Same reasoning as for lncRNA-disease.
# na=False: a missing species would otherwise put NaN into the mask, which
# pandas refuses to use for boolean indexing.
_is_human = circRNA_disease['Species'].str.contains("sapiens", na=False)
_is_circ = circRNA_disease['ncRNA Category'] == 'circRNA'
# drop() returns a fresh frame, so the assignment below does not write into a
# filtered slice of the original frame.
circRNA_disease = circRNA_disease[_is_human & _is_circ].drop(columns=['ncRNA Category', 'Species'])
circRNA_disease['Source(s)'] = 'LncRNADisease'
circRNA_disease

In [None]:
# Inner-join on the lower-cased disease name against desc_disPhe_map (its
# column 0 becomes the join key "desc"), then discard the free-text name.
disease_lookup = desc_disPhe_map.rename(columns={0: 'desc'})
circRNA_disease = circRNA_disease.rename(columns={"Disease Name": "desc"})
circRNA_disease = circRNA_disease.assign(desc=circRNA_disease['desc'].str.lower())
circRNA_disease = disease_lookup.merge(circRNA_disease, on=['desc']).drop(columns=['desc'])
circRNA_disease

***
* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html)

In [None]:
! wget http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/circRNA.xlsx

In [None]:
# Load the Lnc2Cancer circRNA sheet and inner-join its "cancer type" column
# against column 0 of desc_disPhe_map.
# NOTE(review): unlike the LncRNADisease names, "cancer type" is not
# lower-cased before the join - confirm the casing already matches.
lnc2cancer_sheet = pd.read_excel(unprocessed_data_location+'circRNA.xlsx')
circRNA_disease2 = (
    desc_disPhe_map.rename(columns={0: 'desc'})
    .merge(lnc2cancer_sheet, left_on=['desc'], right_on=['cancer type'])
    .drop(columns=['desc', 'cancer type'])
    .rename(columns={'name': 'ncRNA Symbol'})
)
circRNA_disease2['Source(s)'] = 'Lnc2Cancer'
circRNA_disease2

In [None]:
def _coalesce_join(frame, primary, secondary, sep='|'):
    """Return *primary* filled from *secondary*, joining both with *sep* where they differ."""
    first, second = frame[primary], frame[secondary]
    both_differ = first.notna() & second.notna() & (first != second)
    joined = first.astype(str) + sep + second.astype(str)
    return first.fillna(second).mask(both_differ, joined)


def _pmid_as_text(column):
    """Render PubMed IDs as text without a float '.0' suffix; missing values stay NaN."""
    return column.map(lambda value: value if pd.isna(value) else re.sub(r'\.0$', '', str(value)))


# Outer-join the LncRNADisease and Lnc2Cancer records on (disease, symbol).
circRNA_disease = pd.merge(circRNA_disease, circRNA_disease2, how='outer', on=[1,'ncRNA Symbol'])

# Fold each Lnc2Cancer column into its LncRNADisease counterpart. The result is
# assigned back explicitly: `df[col].fillna(..., inplace=True)` works on an
# intermediate object and is not guaranteed to update the frame.
circRNA_disease['Sample'] = _coalesce_join(circRNA_disease, 'Sample', 'sample')
circRNA_disease['Dysfunction Pattern'] = _coalesce_join(circRNA_disease, 'Dysfunction Pattern', 'regulated')
circRNA_disease['Validated Method'] = _coalesce_join(circRNA_disease, 'Validated Method', 'methods')

# Missing sources become the text 'nan' here; the "Manual fix of
# inconsistencies" cell strips 'nan|' / '|nan' afterwards.
circRNA_disease['Source(s)_x'] = circRNA_disease['Source(s)_x'].astype(str)
circRNA_disease['Source(s)_y'] = circRNA_disease['Source(s)_y'].astype(str)
circRNA_disease['Source(s)'] = circRNA_disease['Source(s)_x'] + '|' + circRNA_disease['Source(s)_y']
circRNA_disease = circRNA_disease.drop(columns=['Source(s)_x', 'Source(s)_y'])

# Normalise both PubMed columns to text *before* combining them. Casting to str
# first turned NaN into 'nan' (so nothing was ever filled) and made equal IDs
# compare unequal (str vs. float), yielding duplicates such as '123|123.0'.
circRNA_disease['PubMed ID'] = _pmid_as_text(circRNA_disease['PubMed ID'])
circRNA_disease['pubmed id'] = _pmid_as_text(circRNA_disease['pubmed id'])
# Kept as text ('nan' for missing) because the clean-up cell that follows
# uses the .str accessor and removes the 'nan' placeholders itself.
circRNA_disease['PubMed ID'] = _coalesce_join(circRNA_disease, 'PubMed ID', 'pubmed id').astype(str)

circRNA_disease = circRNA_disease.drop(columns=['sample', 'regulated', 'methods', 'pubmed id',
                                                'function description', 'year', 'title'])

# Strip the "circ" prefix together with an optional '-' or '_' separator in one
# pass; removing 'circ' before 'circ_' left a stray leading underscore.
circRNA_disease['ncRNA Symbol'] = circRNA_disease['ncRNA Symbol'].str.replace(r'circ[-_]?', '', regex=True)
circRNA_disease

In [None]:
# Inner-join on the symbol against symbol_entrez_map, then move the mapped
# identifier (column "0_y") to the front with a "?circRNA" suffix.
symbol_lookup = symbol_entrez_map.rename(columns={'0_x': 'ncRNA Symbol'})
circRNA_disease = circRNA_disease.merge(symbol_lookup, on='ncRNA Symbol')
tagged_ids = circRNA_disease.pop('0_y').astype(str) + '?circRNA'
circRNA_disease.insert(0, '0_y', tagged_ids)
circRNA_disease

Manual fix of inconsistencies.

In [None]:
# Remove the 'nan' placeholders left by string-casting missing PubMed IDs.
_pmids = circRNA_disease['PubMed ID'].str.replace(r'nan\||\|nan', '', regex=True)
# Drop the float artefact only where an ID ends ('123.0' or '123.0|456.0').
# The unescaped regex '.0' matched any character followed by '0' and deleted
# digits from inside the IDs.
_pmids = _pmids.str.replace(r'\.0(?=\||$)', '', regex=True)
# Turn every ID of a '|'-separated list into a PubMed URL.
_pmids = _pmids.str.replace('|', '|https://pubmed.ncbi.nlm.nih.gov/', regex=False)
circRNA_disease['PubMed ID'] = 'https://pubmed.ncbi.nlm.nih.gov/' + _pmids.astype(str)
circRNA_disease['PubMed ID'] = circRNA_disease['PubMed ID'].replace('https://pubmed.ncbi.nlm.nih.gov/nan', np.nan)

circRNA_disease.drop(columns=['ncRNA Symbol'],inplace=True)
circRNA_disease['Source(s)'] = circRNA_disease['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

- Dysfunction pattern.

In [None]:
circRNA_disease['Dysfunction Pattern'] = circRNA_disease['Dysfunction Pattern'].apply(extract_pattern)
circRNA_disease['Dysfunction Pattern'] = circRNA_disease['Dysfunction Pattern'].squeeze()

# Translate the extracted patterns into "<ontology IRI> (<label>)" strings.
_circ_dysfunction_terms = {
    'down-regulated': 'http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)',
    'up-regulated': 'http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)',
    'differentially expressed': 'http://purl.obolibrary.org/obo/OBI_0002584 (differential expression analysis data)',
}
_circ_patterns = circRNA_disease['Dysfunction Pattern']
for _raw, _term in _circ_dysfunction_terms.items():
    _circ_patterns = _circ_patterns.str.replace(_raw, _term, regex=False)

_circ_patterns = _circ_patterns.str.replace(']', '', regex=False)
# Only a free-standing "regulation" is mapped to the GO term. A plain substring
# replace would also hit the "(down-regulation)" / "(up-regulation)" labels
# inserted above and splice a second IRI into them. Excluding "(" keeps the
# cell safe to re-run on already mapped values.
_circ_patterns = _circ_patterns.str.replace(
    r'(?<![\w(-])regulation',
    'http://purl.obolibrary.org/obo/GO_0010468 (regulation of gene expression)',
    regex=True)
circRNA_disease['Dysfunction Pattern'] = _circ_patterns
circRNA_disease['Dysfunction Pattern'].unique()[:5]

- Validated method.

In [None]:
# Fix known typos, lower-case, and normalise the separators to '|' before
# mapping each method through replace_with_ncit.
circ_methods = (circRNA_disease['Validated Method']
                .str.replace('wetern', 'western')
                .str.replace('assay assay', 'assay')
                .str.lower()
                .str.replace('//', '|')
                .str.replace(', ', '|'))
circRNA_disease['Validated Method'] = list(map(replace_with_ncit, circ_methods))
circRNA_disease['Validated Method'].unique()[:5]

- Sample.

In [None]:
# Normalise every separator to a bare '|'. regex=False is pinned because the
# default of `regex` differs between pandas versions: read as a regex, ' |' is
# an alternation that also matches the empty string, and '(' / ')' are
# metacharacters. The former ' | ' and ', ' steps were dropped because they
# could never match after the replacements below.
_circ_separator_fixes = [
    (';', '|'), (',', '|'), ('(', ''), (')', ''),
    (' as well as ', '|'), (' and ', '|'),
    (' |', '|'), ('| ', '|'),
]
_circ_samples = circRNA_disease.Sample
for _old, _new in _circ_separator_fixes:
    _circ_samples = _circ_samples.str.replace(_old, _new, regex=False)

# Apply the cell-line and anatomy mappers in their original order.
for _mapper in (replace_with_clo, replace_with_clo2, replace_with_uberon, replace_with_uberon2):
    _circ_samples = [_mapper(item) for item in _circ_samples]
circRNA_disease['Sample'] = _circ_samples
circRNA_disease.Sample.unique()[:5]

In [None]:
circRNA_disease

In [None]:
# Final column names for the circRNA-disease edge file.
circ_final_headers = {
    '0_y': 'circRNA',
    1: 'Disease',
    'PubMed ID': 'References (PMID)',
}
circRNA_disease.rename(columns=circ_final_headers, inplace=True)

In [None]:
# Pass the frame through merge_rows with the (circRNA, Disease) columns, then
# write the result as a tab-separated file without the index.
circRNA_disease_edges = merge_rows(circRNA_disease, 'circRNA', 'Disease')
circRNA_disease_edges.to_csv(edge_data_location + 'RcircRNA-disease.txt', sep='\t', index=None)

***
### lncRNA-chemical - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki must already be in memory; to reload it, run:
#LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
chemical_lookup = desc_chebi_map.rename(columns={0: 'drug'})
gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'symbol'})

# Keep records that name at least one drug, one row per ';'-separated drug.
drug_records = LncRNAWiki[LncRNAWiki['drug'].notna()].drop(columns=['synonyms', 'gene_locus', 'gene_id'])
drug_records = drug_records.assign(drug=drug_records['drug'].str.split(';')).explode('drug')

# Inner-join drugs against desc_chebi_map and symbols against symbol_entrez_map,
# then drop the raw join keys and the annotation columns not kept for this edge.
lncRNA_chemical = (
    chemical_lookup.merge(drug_records, on=['drug'])
    .merge(gene_lookup, on='symbol')
    .drop(columns=['drug', 'symbol', 'genome_variation', 'variation_detail',
                   'target_interaction', 'molecular_function'])
)
lncRNA_chemical['0_y'] = lncRNA_chemical['0_y'].astype(str)+'?lncRNA'
lncRNA_chemical['Source(s)'] = 'LncRNAWiki'
lncRNA_chemical

Manual fix of inconsistencies.

In [None]:
# PubMed IDs become URLs. A missing ID renders as 'nan' when the Int64 cast is
# skipped and as '<NA>' when it succeeds, so both placeholders are reset to NaN.
lncRNA_chemical['pmid'] = 'https://pubmed.ncbi.nlm.nih.gov/' + lncRNA_chemical['pmid'].astype('Int64', errors='ignore').astype('str')
lncRNA_chemical['pmid'] = lncRNA_chemical['pmid'].replace(
    {'https://pubmed.ncbi.nlm.nih.gov/nan': np.nan,
     'https://pubmed.ncbi.nlm.nih.gov/<NA>': np.nan})

# Literal (non-regex) substitutions, applied per column in the listed order.
_chemical_literal_fixes = {
    'transcript_id': [
        (',', '|'),
    ],
    'conservation_ortholog': [
        ('Human;7SL', 'https://www.ncbi.nlm.nih.gov/gene/6029'),
        ('Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)'),
    ],
    'biological_context': [
        ('Disease', 'http://purl.obolibrary.org/obo/MONDO_0000001 (disease)'),
    ],
    'epigenetic_modification': [
        ('DNA methylation', 'http://purl.obolibrary.org/obo/GO_0006306 (dna methylation)'),
    ],
    'expression': [
        ('RNA', 'http://purl.obolibrary.org/obo/SO_0000356'),
    ],
    'expression_detail': [
        ('Down-regulated', 'http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)'),
        ('Up-regulated', 'http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)'),
        ('Differentially expressed', 'http://purl.obolibrary.org/obo/OBI_0002584 (differential expression)'),
    ],
    'regulator_type': [
        (';', '|'),
        ('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)'),
        ('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)'),
        ('PCG', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)'),
    ],
    'regulator_interaction': [
        (';', '|'),
        ('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)'),
        ('Protein-RNA', 'http://purl.obolibrary.org/obo/NCIT_C19019 (rna-protein interaction)'),
    ],
    'regulator_effect': [
        (';', '|'),
        ('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)'),
        ('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)'),
    ],
    'target_type': [
        (';', '|'),
        ('miRNA', 'http://purl.obolibrary.org/obo/SO_0000276 (mirna)'),
        ('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)'),
        ('PCG', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)'),
        ('lncRNA', 'http://purl.obolibrary.org/obo/SO_0001877 (lncrna)'),
    ],
    'clinical_detail': [
        (';', '|'),
        ('recurrence', 'http://purl.obolibrary.org/obo/NCIT_C3352 (recurrence)'),
        ('prognosis', 'http://purl.obolibrary.org/obo/OGMS_0000093 (prognosis)'),
        ('survival', 'http://purl.obolibrary.org/obo/NCIT_C17177 (survival)'),
        ('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)'),
        ('drug', 'http://purl.obolibrary.org/obo/CHEBI_23888 (drug)'),
    ],
    'functional_mechanism': [
        (';', '|'),
        ('Post-transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C18952 (post-transcriptional regulation)'),
        ('Transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C19077 (transcriptional regulation)'),
        ('Epigenetic regulation', 'http://purl.obolibrary.org/obo/GO_0040029 (epigenetic regulation of gene expression)'),
    ],
}
for _column, _pairs in _chemical_literal_fixes.items():
    _values = lncRNA_chemical[_column]
    for _old, _new in _pairs:
        _values = _values.str.replace(_old, _new, regex=False)
    lncRNA_chemical[_column] = _values

# target_effect is lower-cased before its terms are mapped.
lncRNA_chemical['target_effect'] = (
    lncRNA_chemical['target_effect']
    .str.replace(';', '|', regex=False)
    .str.lower()
    .str.replace('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)', regex=False)
    .str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)', regex=False)
)

lncRNA_chemical['regulator'] = lncRNA_chemical['regulator'].str.replace(';', '|', regex=False).str.lower()
lncRNA_chemical['regulator'] = [replace_with_pro(item) for item in lncRNA_chemical['regulator']]

# Prefix bare miRNA names with the human species code; the look-behind skips
# names that already start with "hsa-", which would otherwise be prefixed twice.
lncRNA_chemical['target'] = lncRNA_chemical['target'].str.replace(r'(?<!hsa-)miR', 'hsa-miR', regex=True)
lncRNA_chemical['target'] = [replace_with_mirbase(item) for item in lncRNA_chemical['target']]
lncRNA_chemical['target'] = lncRNA_chemical['target'].str.replace(';', '|', regex=False).str.lower()
lncRNA_chemical['target'] = [replace_with_pro(item) for item in lncRNA_chemical['target']]

lncRNA_chemical.context_detail = [replace_with_mondo(item) for item in lncRNA_chemical.context_detail]

_tissues = lncRNA_chemical['tissue/cell line'].str.lower().str.replace(';', '|', regex=False)
for _mapper in (replace_with_clo, replace_with_clo2, replace_with_uberon):
    _tissues = [_mapper(item) for item in _tissues]
lncRNA_chemical['tissue/cell line'] = _tissues

_methods = (lncRNA_chemical['experimental_method']
            .str.replace('wetern', 'western', regex=False)
            .str.replace('assay assay', 'assay', regex=False)
            .str.lower()
            .str.replace(';', '|', regex=False))
lncRNA_chemical['experimental_method'] = [replace_with_ncit(item) for item in _methods]

_processes = lncRNA_chemical['biological_process'].str.lower().str.replace(';', '|', regex=False)
lncRNA_chemical['biological_process'] = [replace_with_go(item) for item in _processes]

# NOTE(review): the '??' in the two patterns below looks like mis-encoded Greek
# letters (kappa / beta) - confirm that the raw file really contains '??'.
# regex=False keeps them literal; read as a regex, '-??' and '/??' would be
# lazy optional quantifiers.
_pathways = (lncRNA_chemical['pathway']
             .str.lower()
             .str.replace(';', '|', regex=False)
             .str.replace('nf-??b', 'nuclear factor kappa B', regex=False)
             .str.replace('/??-catenin', '', regex=False)
             .str.replace('pi3k/akt ', '', regex=False))
lncRNA_chemical['pathway'] = [replace_with_pw(item) for item in _pathways]

In [None]:
# Give the integer-labelled column 1 a text name so the string-based header
# clean-up below applies to it as well.
lncRNA_chemical.rename(columns={1: 'chemical'}, inplace=True)
tidy_chemical_headers = lncRNA_chemical.columns.str.replace('_', ' ').str.capitalize()
lncRNA_chemical.columns = tidy_chemical_headers

chemical_final_headers = {
    '0 y': 'lncRNA',
    'Pmid': 'References (PMID)',
    'Transcript id': 'Transcript ID',
}
lncRNA_chemical.rename(columns=chemical_final_headers, inplace=True)

# Move the two edge endpoints to the first two positions.
for position, endpoint in enumerate(('lncRNA', 'Chemical')):
    lncRNA_chemical.insert(position, endpoint, lncRNA_chemical.pop(endpoint))

In [None]:
# Pass the frame through merge_rows with the (lncRNA, Chemical) columns, then
# write the result as a tab-separated file without the index.
lncRNA_chemical_edges = merge_rows(lncRNA_chemical, 'lncRNA', 'Chemical')
lncRNA_chemical_edges.to_csv(edge_data_location + 'RlncRNA-chemical.txt', sep='\t', index=None)

***
### lncRNA-protein

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br /> LncBook accommodates a high-quality collection of human lncRNA genes and transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts. 

#### gene product of 

In [None]:
! wget https://ngdc.cncb.ac.cn/lncbook/files/sprotein_LncBook2.0.csv.gz

In [None]:
# Load the LncBook small-protein table, skip rows whose Symbol is the '-'
# placeholder, and drop the columns not kept for this edge.
smprot_table = pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz')
smprot_unused = ['Gene ID', 'Transcript ID', 'SmProt Loci', 'SmProt Protein Sequence']
lncRNA_protein = smprot_table[smprot_table['Symbol'] != '-'].drop(columns=smprot_unused)
lncRNA_protein

In [None]:
# Inner-join on the symbol against symbol_entrez_map, drop the symbol, and tag
# the mapped identifier (column "0_y") with "?lncRNA".
smprot_gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'Symbol'})
lncRNA_protein = lncRNA_protein.merge(smprot_gene_lookup, on='Symbol').drop(columns='Symbol')
lncRNA_protein['0_y'] = lncRNA_protein['0_y'].astype(str) + '?lncRNA'
lncRNA_protein['Source(s)'] = 'LncBook'
lncRNA_protein

In [None]:
# Print the raw evidence labels, then swap the two known ones for ontology terms.
print(lncRNA_protein['Experimental Evidence'].unique())
evidence_terms = (
    ('Ribo-seq', 'http://www.ebi.ac.uk/efo/EFO_0008891 (ribo-seq)'),
    ('Mass spectrometry', 'http://purl.obolibrary.org/obo/PRIDE_0000027 (mass spectrometry)'),
)
for raw_label, ontology_term in evidence_terms:
    lncRNA_protein['Experimental Evidence'] = lncRNA_protein['Experimental Evidence'].str.replace(raw_label, ontology_term)

In [None]:
# Final endpoint names; the lncRNA column is moved to position 1.
smprot_final_headers = {'0_y': 'lncRNA', 'SmProt ID': 'Small protein'}
lncRNA_protein.rename(columns=smprot_final_headers, inplace=True)
lncrna_endpoint = lncRNA_protein.pop('lncRNA')
lncRNA_protein.insert(1, 'lncRNA', lncrna_endpoint)

In [None]:
# Pass the frame through merge_rows with the (Small protein, lncRNA) columns,
# then write the result as a tab-separated file without the index.
small_protein_edges = merge_rows(lncRNA_protein, 'Small protein', 'lncRNA')
small_protein_edges.to_csv(edge_data_location + 'RsmallProtein-lncRNA.txt', sep='\t', index=None)

#### interacts with

In [None]:
! wget https://ngdc.cncb.ac.cn/lncbook/files/lncrna_rbp_LncBook2.0.csv.gz

In [None]:
# Load the LncBook lncRNA-RBP table, skip rows whose Symbol is the '-'
# placeholder, and drop the columns not kept for this edge.
rbp_table = pd.read_csv(unprocessed_data_location + 'lncrna_rbp_LncBook2.0.csv.gz')
lncRNA_protein = rbp_table[rbp_table['Symbol'] != '-'].drop(columns=['Gene ID', 'Length'])
lncRNA_protein['Source(s)'] = 'LncBook'
lncRNA_protein

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
#LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Keep the LncRNAWiki rows whose target type mentions 'TF' or 'protein'. The TF rows come
# first, and a row mentioning both appears once in each part of the concatenation.
lncRNA_protein2 = LncRNAWiki.loc[LncRNAWiki['target_type'].notna()]
lncRNA_protein2 = pd.concat([
    lncRNA_protein2.loc[lncRNA_protein2['target_type'].str.contains(kind)]
    for kind in ('TF', 'protein')
])
lncRNA_protein2 = lncRNA_protein2.drop(columns=['synonyms', 'gene_locus', 'gene_id'])
# One row per target: ';'-separated target lists are split and exploded.
lncRNA_protein2 = lncRNA_protein2.assign(
    target=lncRNA_protein2['target'].str.split(';')).explode('target')
lncRNA_protein2 = lncRNA_protein2.rename(columns={'symbol': 'Symbol', 'target': 'Protein'})
lncRNA_protein2['Source(s)'] = 'LncRNAWiki'
lncRNA_protein2
# We don't care about miRNA wrongly labeled as TF as they will be discarded when terms will be mapped on PRO

In [None]:
# Combine the LncBook and LncRNAWiki interaction tables; the outer merge keeps pairs that are
# reported by only one of the two sources.
lncRNA_protein = pd.merge(lncRNA_protein, lncRNA_protein2, how='outer', on=['Symbol','Protein'])
# Map lncRNA symbols through symbol_entrez_map and protein symbols through symbol_to_pro; both
# are inner merges, so rows without a match in the respective map are dropped.
lncRNA_protein = pd.merge(lncRNA_protein, symbol_entrez_map.rename(columns={'0_x':'Symbol'}), on='Symbol')
lncRNA_protein['0_y'] = lncRNA_protein['0_y'].astype(str) + '?lncRNA'
lncRNA_protein = pd.merge(lncRNA_protein, symbol_to_pro.rename(columns={'0_x':'Protein'}), on='Protein')
lncRNA_protein.drop(columns=['Symbol','Protein'],inplace=True)

# Source(s): join the two per-source labels with '|' and strip the 'nan' placeholder left by
# the side that did not report the pair (raw string: '\|' is not a valid str escape).
lncRNA_protein['Source(s)_x'] = lncRNA_protein['Source(s)_x'].astype(str)
lncRNA_protein['Source(s)_y'] = lncRNA_protein['Source(s)_y'].astype(str)
lncRNA_protein['Source(s)'] = lncRNA_protein['Source(s)_x'] + '|' + lncRNA_protein['Source(s)_y']
lncRNA_protein = lncRNA_protein.drop(columns=['Source(s)_x', 'Source(s)_y'])
lncRNA_protein['Source(s)'] = lncRNA_protein['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

# Transcript IDs: stringify only the values that are present. A blanket astype(str) turns
# missing IDs into the literal text 'nan', which makes fillna a no-op and leaks 'nan' into the
# export whenever both sources lack an ID.
lncbook_ids = lncRNA_protein['Transcript ID'].astype(object)
lncbook_ids = lncbook_ids.where(lncbook_ids.isna(), lncbook_ids.astype(str))
wiki_ids = lncRNA_protein['transcript_id'].astype(object)
wiki_ids = wiki_ids.where(wiki_ids.isna(), wiki_ids.astype(str))
# Prefer the 'Transcript ID' value and fall back to 'transcript_id'; when both exist and
# disagree, keep both separated by '|'.
transcript_ids = lncbook_ids.fillna(wiki_ids)
both_differ = lncbook_ids.notna() & wiki_ids.notna() & (lncbook_ids != wiki_ids)
transcript_ids.loc[both_differ] = lncbook_ids[both_differ] + '|' + wiki_ids[both_differ]
lncRNA_protein['Transcript ID'] = transcript_ids.str.replace(',', '|')

lncRNA_protein.drop(columns=['transcript_id', 'genome_variation', 'variation_detail', 'molecular_function',
                             'target_interaction'],inplace=True)

lncRNA_protein

Manual fix.

In [None]:
# Rewrite the free-text annotation columns as '<ontology IRI> (<label>)' strings; multi-valued
# cells use '|' as separator.

# PubMed IDs become PubMed URLs; rows without an ID are reset to NaN.
lncRNA_protein['pmid'] = 'https://pubmed.ncbi.nlm.nih.gov/' + lncRNA_protein['pmid'].astype('Int64', errors='ignore').astype('str')
lncRNA_protein['pmid'] = lncRNA_protein['pmid'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)

lncRNA_protein['Transcript ID'] = lncRNA_protein['Transcript ID'].str.replace(',','|')

lncRNA_protein['Cell Line'] = lncRNA_protein['Cell Line'].str.replace('HepG2','http://www.ebi.ac.uk/efo/EFO_0001187 (hepg2)')
lncRNA_protein['Cell Line'] = lncRNA_protein['Cell Line'].str.replace('K562','http://purl.obolibrary.org/obo/CLO_0007060 (k562 cl.6 cell)')

lncRNA_protein.conservation_ortholog = lncRNA_protein.conservation_ortholog.str.replace('Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)')

lncRNA_protein.biological_context = lncRNA_protein.biological_context.str.replace('Disease', 'http://purl.obolibrary.org/obo/MONDO_0000001 (disease)')

lncRNA_protein['epigenetic_modification'] = lncRNA_protein['epigenetic_modification'].str.replace('DNA methylation','http://purl.obolibrary.org/obo/GO_0006306 (dna methylation)')
lncRNA_protein['epigenetic_modification'] = lncRNA_protein['epigenetic_modification'].str.replace('Epigenetic change','http://purl.obolibrary.org/obo/NCIT_C21051 (epigenetic process)')

# 'hypomethylation' contains 'methylation': two sequential replacements rewrite the inner word
# first, so the longer term never matches. A single pass with the longer alternative listed
# first maps each term exactly once.
_methylation_terms = {
    'hypomethylation': 'http://purl.obolibrary.org/obo/NCIT_C121521 (hypomethylation)',
    'methylation': 'http://purl.obolibrary.org/obo/GO_0032259 (methylation)',
}
lncRNA_protein['modification_detail'] = lncRNA_protein['modification_detail'].str.replace(
    'hypomethylation|methylation', lambda match: _methylation_terms[match.group(0)], regex=True)

lncRNA_protein['expression'] = lncRNA_protein['expression'].str.replace(
    'RNA', 'http://purl.obolibrary.org/obo/SO_0000356 (rna)')

lncRNA_protein['expression_detail'] = lncRNA_protein['expression_detail'].str.replace('Down-regulated','http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)')
lncRNA_protein['expression_detail'] = lncRNA_protein['expression_detail'].str.replace('Up-regulated','http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)')
lncRNA_protein['expression_detail'] = lncRNA_protein['expression_detail'].str.replace('Differentially expressed','http://purl.obolibrary.org/obo/OBI_0002584 (differential expression)')

lncRNA_protein['regulator_type'] = lncRNA_protein['regulator_type'].str.replace('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)')
lncRNA_protein['regulator_type'] = lncRNA_protein['regulator_type'].str.replace('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')

# Look up the regulator symbol in symbol_to_pro; the left merge keeps rows without a match.
# NOTE(review): the original 'regulator' column is kept next to the new '1_y_y' column -
# confirm that the column renaming in the following cell does not produce two 'Regulator' columns.
lncRNA_protein = pd.merge(lncRNA_protein, symbol_to_pro[['0_x','1_y']].rename(columns={'0_x':'regulator'}), on='regulator', how='left')
lncRNA_protein['1_y_y'] = 'http://purl.obolibrary.org/obo/' + lncRNA_protein['1_y_y'].astype('str')
lncRNA_protein['1_y_y'] = lncRNA_protein['1_y_y'].replace('http://purl.obolibrary.org/obo/nan', np.nan)

lncRNA_protein['regulator_interaction'] = lncRNA_protein['regulator_interaction'].str.replace('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)')

lncRNA_protein['regulator_effect'] = lncRNA_protein['regulator_effect'].str.replace('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')
lncRNA_protein['regulator_effect'] = lncRNA_protein['regulator_effect'].str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)')

lncRNA_protein['clinical_detail'] = lncRNA_protein['clinical_detail'].str.replace(';', '|')
lncRNA_protein['clinical_detail'] = lncRNA_protein['clinical_detail'].str.replace('recurrence', 'http://purl.obolibrary.org/obo/NCIT_C3352 (recurrence)')
lncRNA_protein['clinical_detail'] = lncRNA_protein['clinical_detail'].str.replace('prognosis', 'http://purl.obolibrary.org/obo/OGMS_0000093 (prognosis)')
lncRNA_protein['clinical_detail'] = lncRNA_protein['clinical_detail'].str.replace('survival', 'http://purl.obolibrary.org/obo/NCIT_C17177 (survival)')
lncRNA_protein['clinical_detail'] = lncRNA_protein['clinical_detail'].str.replace('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)')
lncRNA_protein['clinical_detail'] = lncRNA_protein['clinical_detail'].str.replace('drug', 'http://purl.obolibrary.org/obo/CHEBI_23888 (drug)')

lncRNA_protein['functional_mechanism'] = lncRNA_protein['functional_mechanism'].str.replace(';', '|')
lncRNA_protein['functional_mechanism'] = lncRNA_protein['functional_mechanism'].str.replace('Post-transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C18952 (post-transcriptional regulation)')
lncRNA_protein['functional_mechanism'] = lncRNA_protein['functional_mechanism'].str.replace('Transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C19077 (transcriptional regulation)')
lncRNA_protein['functional_mechanism'] = lncRNA_protein['functional_mechanism'].str.replace('Epigenetic regulation', 'http://purl.obolibrary.org/obo/GO_0040029 (epigenetic regulation of gene expression)')

lncRNA_protein['target_type'] = lncRNA_protein['target_type'].str.replace('miRNA', 'http://purl.obolibrary.org/obo/SO_0000276 (mirna)')
lncRNA_protein['target_type'] = lncRNA_protein['target_type'].str.replace('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')
lncRNA_protein['target_type'] = lncRNA_protein['target_type'].str.replace('PCG', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)')
lncRNA_protein['target_type'] = lncRNA_protein['target_type'].str.replace('lncRNA', 'http://purl.obolibrary.org/obo/SO_0001877 (lncrna)')

lncRNA_protein['target_effect'] = lncRNA_protein['target_effect'].str.replace(';', '|')
lncRNA_protein['target_effect'] = lncRNA_protein['target_effect'].str.replace('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')
lncRNA_protein['target_effect'] = lncRNA_protein['target_effect'].str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)')

# The remaining columns are grounded through the replace_with_* lookup helpers, which work on
# '|'-separated parts; hence the lower-casing and ';' -> '|' normalisation before each call.
lncRNA_protein.context_detail = [replace_with_mondo(item) for item in lncRNA_protein.context_detail]

lncRNA_protein['tissue/cell line'] = lncRNA_protein['tissue/cell line'].str.lower()
lncRNA_protein['tissue/cell line'] = lncRNA_protein['tissue/cell line'].str.replace(';', '|')
lncRNA_protein['tissue/cell line'] = [replace_with_clo(item) for item in lncRNA_protein['tissue/cell line']]
lncRNA_protein['tissue/cell line'] = [replace_with_clo2(item) for item in lncRNA_protein['tissue/cell line']]
lncRNA_protein['tissue/cell line'] = [replace_with_uberon(item) for item in lncRNA_protein['tissue/cell line']]

lncRNA_protein['experimental_method'] = lncRNA_protein['experimental_method'].str.replace('wetern', 'western')
lncRNA_protein['experimental_method'] = lncRNA_protein['experimental_method'].str.replace('assay assay', 'assay')
lncRNA_protein['experimental_method'] = lncRNA_protein['experimental_method'].str.lower()
lncRNA_protein['experimental_method'] = lncRNA_protein['experimental_method'].str.replace(';', '|')
lncRNA_protein['experimental_method'] = [replace_with_ncit(item) for item in lncRNA_protein['experimental_method']]

lncRNA_protein['biological_process'] = lncRNA_protein['biological_process'].str.lower()
lncRNA_protein['biological_process'] = lncRNA_protein['biological_process'].str.replace(';', '|')
lncRNA_protein['biological_process'] = [replace_with_go(item) for item in lncRNA_protein['biological_process']]

lncRNA_protein['pathway'] = lncRNA_protein['pathway'].str.lower()
lncRNA_protein['pathway'] = lncRNA_protein['pathway'].str.replace(';', '|')
# The '??' below are literal characters in the data (presumably a mis-encoded Greek letter).
# As a regular expression '??' is a lazy quantifier and would match something else, so the
# patterns are forced to be literal regardless of the pandas default for `regex`.
lncRNA_protein['pathway'] = lncRNA_protein['pathway'].str.replace('nf-??b', 'nuclear factor kappa B', regex=False)
lncRNA_protein['pathway'] = lncRNA_protein['pathway'].str.replace('/??-catenin', '', regex=False)
lncRNA_protein['pathway'] = lncRNA_protein['pathway'].str.replace('pi3k/akt ', '')
lncRNA_protein['pathway'] = [replace_with_pw(item) for item in lncRNA_protein['pathway']]

lncRNA_protein['drug'] = lncRNA_protein['drug'].str.lower()
lncRNA_protein['drug'] = lncRNA_protein['drug'].str.replace(';', '|')

lncRNA_protein['drug'] = [replace_with_drugbank(item) for item in lncRNA_protein['drug']]

In [None]:
# Prettify the headers ('_' -> ' ', first letter upper-case, the rest lower-case), then restore
# the names that the capitalisation flattened or that are still merge suffixes.
# NOTE(review): if a 'regulator' column is still present it also becomes 'Regulator' here, the
# same name given to '1 y y' below - confirm no duplicate header is produced.
lncRNA_protein.columns = lncRNA_protein.columns.str.replace('_', ' ').str.capitalize()
lncRNA_protein = lncRNA_protein.rename(columns={
    '0 y': 'lncRNA',
    '1 y x': 'Protein',
    'Pmid': 'References (PMID)',
    'Transcript id': 'Transcript ID',
    '1 y y': 'Regulator',
})
# Edge endpoints first, provenance last.
for position, column in enumerate(('lncRNA', 'Protein')):
    lncRNA_protein.insert(position, column, lncRNA_protein.pop(column))
lncRNA_protein['Source(s)'] = lncRNA_protein.pop('Source(s)')

In [None]:
# Collapse the rows per (lncRNA, Protein) pair with merge_rows and write the result as a
# tab-separated edge file without the index column.
merge_rows(lncRNA_protein,'lncRNA','Protein').to_csv(edge_data_location + 'RlncRNA-protein.txt', sep='\t', index=None)

***
### lncRNA-biological context - http://purl.obolibrary.org/obo/RO_0002245 + 2246 + 2291 (over-expressed in + under-expressed in + ubiquitously expressed in)

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# Download the LncBook 2.0 expression table; without -P/-O wget saves it in the current
# working directory.
# NOTE(review): the next cell reads this file from `unprocessed_data_location` - confirm the
# download ends up there.
! wget https://ngdc.cncb.ac.cn/lncbook/files/expression_LncBook2.0.csv.gz

In [None]:
# Load the LncBook expression table, keep the rows whose Symbol is not the '-' placeholder and
# drop the unused columns.
lncRNA_expression = pd.read_csv(unprocessed_data_location + 'expression_LncBook2.0.csv.gz')
lncRNA_expression = lncRNA_expression.loc[lncRNA_expression['Symbol'] != '-'].drop(
    columns=['Gene ID', 'Featured Expression'])
# One row per symbol: comma-separated symbol lists are split and exploded.
lncRNA_expression = lncRNA_expression.assign(
    Symbol=lncRNA_expression['Symbol'].str.split(',')).explode('Symbol')
# Rename each expression-context column to the ontology term used for it downstream.
lncRNA_expression = lncRNA_expression.rename(columns={
    'Normal Tissue/Cell Line': 'UBERON_0000479',
    'Organ Development': 'GO_0048513',
    'Preimplantation Embryo': 'GO_0007566',
    'Cell Differentiation': 'GO_0030154',
    'Subcellular Localization': 'GO_0051179',
    'Exosome': 'GO_0070062',
    'Cancer Cell Line': 'CLO_0009828',
    'Virus Infection': 'MONDO_0005108',
    'Circadian Rhythm': 'GO_0007623',
})
lncRNA_expression

In [None]:
# Ontology-term column names of lncRNA_expression that hold a per-context expression level.
_CONTEXT_TERMS = ['UBERON_0000479', 'GO_0048513', 'GO_0007566', 'GO_0030154', 'GO_0051179',
                  'GO_0070062', 'CLO_0009828', 'MONDO_0005108', 'GO_0007623']


def _edges_for_level(level):
    """Build the (lncRNA, context term) table for one expression-level code.

    For every context column, the rows whose value equals `level` are kept, the cell value is
    replaced by the column's own ontology term and the column is renamed to `level`. The
    stacked result is inner-joined with symbol_entrez_map on the gene symbol, the mapped
    identifier ('0_y') is moved to the front and suffixed with '?lncRNA', and the provenance
    column is added.
    """
    stacked = pd.DataFrame()
    for term in _CONTEXT_TERMS:
        subset = lncRNA_expression[['Symbol', 'Expression Capacity', term]]
        subset = subset[subset[term] == level].assign(**{term: term}).rename(columns={term: level})
        stacked = pd.concat([stacked, subset])
    stacked = pd.merge(stacked, symbol_entrez_map.rename(columns={'0_x': 'Symbol'}), on='Symbol')
    stacked = stacked.drop(columns='Symbol')
    stacked.insert(0, '0_y', stacked.pop('0_y'))
    stacked['0_y'] = stacked['0_y'].astype(str) + '?lncRNA'
    stacked['Source(s)'] = 'LncBook'
    return stacked


# HC
HCfinal = _edges_for_level('HC')
# NE
NEfinal = _edges_for_level('NE')
# MC
MCfinal = _edges_for_level('MC')
# LC
LCfinal = _edges_for_level('LC')
HCfinal

In [None]:
# Give all four level tables the same final headers (renamed in place so the notebook
# variables keep pointing at the updated frames).
for edges, level in ((HCfinal, 'HC'), (LCfinal, 'LC'), (MCfinal, 'MC'), (NEfinal, 'NE')):
    edges.rename(columns={'0_y': 'lncRNA', level: 'Biological context'}, inplace=True)

# One de-duplicated, tab-separated edge file per level; the number in the file name is the
# RO relation listed in this section's heading.
for edges, file_name in ((HCfinal, 'RlncRNA-biologicalContext2245.txt'),
                         (LCfinal, 'RlncRNA-biologicalContext2246.txt'),
                         (MCfinal, 'RlncRNA-biologicalContext2291.txt')):
    edges.drop(columns=['Expression Capacity']).drop_duplicates().to_csv(
        edge_data_location + file_name, sep='\t', index=None)
# Absent in RO properties: NOT EXPRESSED IN
#NEfinal.drop(columns=['Expression Capacity']).drop_duplicates().to_csv(
    #edge_data_location + 'lncRNA-biologicalContextNE.txt', sep='\t', index=None)

***
### lncRNA-biological role - http://purl.obolibrary.org/obo/RO_0002260 (has biological role)

* [dbEssLnc](https://esslnc.pufengdu.org/home) <br /> dbEssLnc contains lncRNA annotations; data are constantly added by manual screening. 

In [None]:
# Download the dbEssLnc essential-lncRNA JSON (TLS certificate validation is disabled by
# --no-check-certificate); without -P/-O wget saves it in the current working directory.
# NOTE(review): the next cell reads 'essential lncRNA.json' from `unprocessed_data_location` -
# confirm the download ends up there under that name.
! wget https://esslnc.pufengdu.org/data/essential%20lncRNA.json --no-check-certificate

In [None]:
# Load dbEssLnc, keep the human entries, drop the unused columns, record the provenance and
# turn the 'N.A.' placeholder of the GO annotations into a real missing value.
dbEssLnc = pd.read_json(unprocessed_data_location + 'essential lncRNA.json')
lncRNA_role = dbEssLnc.loc[dbEssLnc['Organism'] == 'Human'].drop(
    columns=['ID', 'Name', 'Aliases', 'fId', 'NONCODEId', 'Organism'])
lncRNA_role['Source(s)'] = 'dbEssLnc'
lncRNA_role['Gene_Ontology_Annotations'] = lncRNA_role['Gene_Ontology_Annotations'].replace(
    'N.A.', np.nan)
lncRNA_role

In [None]:
# Show the distinct role labels before normalising them.
print(lncRNA_role['Role'].unique())
# For grounding purposes
lncRNA_role.replace(to_replace='Tumor suppressor gene', value='Tumor-Suppressor-Gene', inplace=True)

In [None]:
# PubMed IDs become PubMed URLs; rows without an ID are reset to NaN.
lncRNA_role['PMID'] = (
    'https://pubmed.ncbi.nlm.nih.gov/'
    + lncRNA_role['PMID'].astype('Int64', errors='ignore').astype('str')
).replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)
# Tag the gene identifier as an lncRNA node and put the role right after the first column.
lncRNA_role['NCBI_gene_Id'] = lncRNA_role['NCBI_gene_Id'].astype(str) + '?lncRNA'
lncRNA_role.insert(1, 'Role', lncRNA_role.pop('Role'))

lncRNA_role = lncRNA_role.rename(columns={
    'NCBI_gene_Id': 'lncRNA',
    'Role': 'Biological role',
    'PMID': 'References (PMID)',
    'Gene_Ontology_Annotations': 'Gene Ontology annotations',
})

# De-duplicated, tab-separated edge file without the index column.
lncRNA_role.drop_duplicates().to_csv(
    edge_data_location + 'RlncRNA-role.txt', sep='\t', index=None)

***
### lncRNA-cellular compartment - http://purl.obolibrary.org/obo/RO_0001018 (contained in)

* [lncATLAS](https://lncatlas.crg.eu/) <br /> LncATLAS displays the subcellular localisation for GENCODE-annotated lncRNAs. This localisation is expressed in units of Relative Concentration Index (RCI) - a comparison of the concentration of a gene, per unit mass of RNA, between two cellular compartments.

In [None]:
# Download the full lncATLAS data dump.
# NOTE(review): the URL embeds a session identifier and may no longer resolve; wget names the
# saved file after the last URL component, whereas the next cell reads
# '2023-05-09_lncATLAS_all_data.csv' from `unprocessed_data_location` - confirm the file is
# renamed/moved accordingly. The unquoted '?' is also a shell glob character.
! wget https://lncatlas.crg.eu/session/014e12df4b0975891edb6d8ba3a33b0e/download/retrieveall?w=

In [None]:
# Load the lncATLAS dump, keep the four columns that are used and inner-join on the gene name
# (rows without a match in symbol_entrez_map are dropped).
lncRNA_comp = pd.read_csv(unprocessed_data_location + '2023-05-09_lncATLAS_all_data.csv')
lncRNA_comp = (
    lncRNA_comp[['Data Source', 'Data Type', 'Value', 'Gene Name']]
    .merge(symbol_entrez_map.rename(columns={'0_x': 'Gene Name'}), on='Gene Name')
    .drop(columns=['Gene Name'])
)
lncRNA_comp['0_y'] = lncRNA_comp['0_y'].astype(str) + '?lncRNA'
lncRNA_comp

In [None]:
# Data cleaning rule to establish relations: discard RCI below the mean
rci_mean = lncRNA_comp['Value'].mean()
print(rci_mean)

# Rows with a missing Value compare as False and are discarded as well.
lncRNA_comp = lncRNA_comp.loc[lncRNA_comp['Value'].ge(rci_mean)]

In [None]:
# Mapping to GO CC
# Display the distinct 'Data Type' labels present in the table.
lncRNA_comp['Data Type'].unique()

In [None]:
# Compartment label -> GO cellular-component term; labels missing from the table keep their
# original text and are filtered out right after.
compartment_to_go = {
    'nucleus': 'GO_0005634',
    'cytosol': 'GO_0005829',
    'chromatin': 'GO_0000785',
    'membrane': 'GO_0016020',
    'nucleolus': 'GO_0005730',
    'nucleoplasm': 'GO_0005654',
}
lncRNA_comp['gocc'] = lncRNA_comp['Data Type'].replace(compartment_to_go)

is_go_term = lncRNA_comp['gocc'].astype(str).str.startswith('GO_')
lncRNA_comp = lncRNA_comp.loc[is_go_term].drop(columns=['Data Type'])
lncRNA_comp['Source(s)'] = 'lncATLAS'
lncRNA_comp

Manual fix of inconsistencies.

In [None]:
# Normalise the cell-line labels (dots -> spaces, lower case) before looking them up.
# '.' has to be a literal dot: as a regular expression it matches every character, so the
# literal interpretation is requested explicitly instead of relying on the pandas default
# for `regex`, which differs between major versions.
lncRNA_comp['Data Source'] = lncRNA_comp['Data Source'].str.replace('.', ' ', regex=False).str.lower()
# Two lookup passes: replace_with_clo first, then replace_with_clo2 on whatever is left.
lncRNA_comp['Data Source'] = [replace_with_clo(item) for item in lncRNA_comp['Data Source']]
lncRNA_comp['Data Source'] = [replace_with_clo2(item) for item in lncRNA_comp['Data Source']]

# Display the outcome so labels that were not grounded can be spotted.
lncRNA_comp['Data Source'].unique()

In [None]:
# Final headers, with the two edge endpoints moved to the front.
lncRNA_comp = lncRNA_comp.rename(columns={
    '0_y': 'lncRNA',
    'gocc': 'Cellular component',
    'Data Source': 'Cell line',
})
for position, column in enumerate(('lncRNA', 'Cellular component')):
    lncRNA_comp.insert(position, column, lncRNA_comp.pop(column))

In [None]:
# Write the de-duplicated table as a tab-separated edge file without the index column.
lncRNA_comp.drop_duplicates().to_csv(edge_data_location + 'RlncRNA-gocc.txt', sep='\t', index=None)

***
### lncRNA-pathway - http://purl.obolibrary.org/obo/RO_0000056 (participates in)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
#LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
lncRNA_pw = LncRNAWiki[LncRNAWiki['pathway'].notna()]
lncRNA_pw = lncRNA_pw.drop(columns=['synonyms','gene_locus','gene_id'])
lncRNA_pw.pathway = lncRNA_pw.pathway.str.lower()
lncRNA_pw

In [None]:
# Inner-join first on the gene symbol, then on the pathway description; rows without a match
# in either map are dropped. Afterwards discard the columns that are not exported.
lncRNA_pw = lncRNA_pw.merge(symbol_entrez_map.rename(columns={'0_x': 'symbol'}), on='symbol')
lncRNA_pw = lncRNA_pw.merge(desc_pw_map.rename(columns={0: 'pathway'}), on='pathway')
lncRNA_pw = lncRNA_pw.drop(columns=[
    'symbol', 'genome_variation', 'variation_detail', 'modification_detail',
    'target_interaction', 'molecular_function', 'pathway',
])
lncRNA_pw['0_y'] = lncRNA_pw['0_y'].astype(str) + '?lncRNA'
lncRNA_pw['Source(s)'] = 'LncRNAWiki'
lncRNA_pw.head()

Manual fix of inconsistencies.

In [None]:
# Rewrite the free-text annotation columns as '<ontology IRI> (<label>)' strings; multi-valued
# cells use '|' as separator.

# PubMed IDs become PubMed URLs; rows without an ID are reset to NaN.
lncRNA_pw['pmid'] = 'https://pubmed.ncbi.nlm.nih.gov/' + lncRNA_pw['pmid'].astype('Int64', errors='ignore').astype('str')
lncRNA_pw['pmid'] = lncRNA_pw['pmid'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)

lncRNA_pw['transcript_id'] = lncRNA_pw['transcript_id'].str.replace(',','|')

lncRNA_pw.conservation_ortholog = lncRNA_pw.conservation_ortholog.str.replace('Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)')

lncRNA_pw.biological_context = lncRNA_pw.biological_context.str.replace('Disease', 'http://purl.obolibrary.org/obo/MONDO_0000001 (disease)')
lncRNA_pw.biological_context = lncRNA_pw.biological_context.str.replace('Circadian', 'http://purl.obolibrary.org/obo/GO_0007623 (circadian rhythm)')
lncRNA_pw.biological_context = lncRNA_pw.biological_context.str.replace('Cell Differentiation', 'http://purl.obolibrary.org/obo/GO_0030154 (cell differentiation)')
lncRNA_pw.biological_context = lncRNA_pw.biological_context.str.replace('Preimplantation Embryo', 'http://purl.obolibrary.org/obo/GO_0007566 (embryo implantation)')

lncRNA_pw['epigenetic_modification'] = lncRNA_pw['epigenetic_modification'].str.replace('Histone modification','http://purl.obolibrary.org/obo/GO_0016570 (histone modification)')

lncRNA_pw['expression'] = lncRNA_pw['expression'].str.replace(
    'RNA', 'http://purl.obolibrary.org/obo/SO_0000356 (rna)')

lncRNA_pw['expression_detail'] = lncRNA_pw['expression_detail'].str.replace('Down-regulated','http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)')
lncRNA_pw['expression_detail'] = lncRNA_pw['expression_detail'].str.replace('Up-regulated','http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)')
lncRNA_pw['expression_detail'] = lncRNA_pw['expression_detail'].str.replace('Differentially expressed','http://purl.obolibrary.org/obo/OBI_0002584 (differential expression)')

# The combined 'PCG;TF' value has to be handled before the single terms: once 'PCG' and 'TF'
# have been rewritten the combined pattern can no longer match and the ';' separator would
# survive. The transcription-factor IRI is NCIT_C17207, as in every other column.
lncRNA_pw['regulator_type'] = lncRNA_pw['regulator_type'].str.replace('PCG;TF', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)|http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')
lncRNA_pw['regulator_type'] = lncRNA_pw['regulator_type'].str.replace('PCG', 'http://purl.obolibrary.org/obo/SO_0001217 (protein_coding_gene)')
lncRNA_pw['regulator_type'] = lncRNA_pw['regulator_type'].str.replace('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')

lncRNA_pw['regulator_interaction'] = lncRNA_pw['regulator_interaction'].str.replace('Protein-DNA', 'http://purl.obolibrary.org/obo/NCIT_C18755 (dna-protein interaction)')

lncRNA_pw['regulator_effect'] = lncRNA_pw['regulator_effect'].str.replace('promote;promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)|http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')
lncRNA_pw['regulator_effect'] = lncRNA_pw['regulator_effect'].str.replace('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')
lncRNA_pw['regulator_effect'] = lncRNA_pw['regulator_effect'].str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)')

lncRNA_pw['target_type'] = lncRNA_pw['target_type'].str.replace(';', '|')
lncRNA_pw['target_type'] = lncRNA_pw['target_type'].str.replace('miRNA', 'http://purl.obolibrary.org/obo/SO_0000276 (mirna)')
lncRNA_pw['target_type'] = lncRNA_pw['target_type'].str.replace('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')
lncRNA_pw['target_type'] = lncRNA_pw['target_type'].str.replace('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)')

lncRNA_pw['target_effect'] = lncRNA_pw['target_effect'].str.replace(';', '|')
lncRNA_pw['target_effect'] = lncRNA_pw['target_effect'].str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)')
lncRNA_pw['target_effect'] = lncRNA_pw['target_effect'].str.replace('promote', 'http://purl.obolibrary.org/obo/NCIT_C61391 (promotion)')

lncRNA_pw['functional_mechanism'] = lncRNA_pw['functional_mechanism'].str.replace('Post-transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C18952 (post-transcriptional regulation)')
lncRNA_pw['functional_mechanism'] = lncRNA_pw['functional_mechanism'].str.replace('Transcriptional regulation', 'http://purl.obolibrary.org/obo/NCIT_C19077 (transcriptional regulation)')
lncRNA_pw['functional_mechanism'] = lncRNA_pw['functional_mechanism'].str.replace('Epigenetic regulation', 'http://purl.obolibrary.org/obo/GO_0040029 (epigenetic regulation of gene expression)')

lncRNA_pw['clinical_detail'] = lncRNA_pw['clinical_detail'].str.replace('recurrence', 'http://purl.obolibrary.org/obo/NCIT_C3352 (recurrence)')
lncRNA_pw['clinical_detail'] = lncRNA_pw['clinical_detail'].str.replace('metastasis', 'http://purl.obolibrary.org/obo/NCIT_C19151 (metastasis)')
lncRNA_pw['clinical_detail'] = lncRNA_pw['clinical_detail'].str.replace('drug', 'http://purl.obolibrary.org/obo/CHEBI_23888 (drug)')

# The remaining columns are grounded through the replace_with_* lookup helpers, which work on
# '|'-separated parts; hence the lower-casing and ';' -> '|' normalisation around the calls.
lncRNA_pw.context_detail = [replace_with_mondo(item) for item in lncRNA_pw.context_detail]

lncRNA_pw['tissue/cell line'] = lncRNA_pw['tissue/cell line'].str.lower()
lncRNA_pw['tissue/cell line'] = lncRNA_pw['tissue/cell line'].str.replace(';', '|')
lncRNA_pw['tissue/cell line'] = [replace_with_clo(item) for item in lncRNA_pw['tissue/cell line']]
lncRNA_pw['tissue/cell line'] = [replace_with_clo2(item) for item in lncRNA_pw['tissue/cell line']]
lncRNA_pw['tissue/cell line'] = [replace_with_uberon(item) for item in lncRNA_pw['tissue/cell line']]

lncRNA_pw['regulator'] = lncRNA_pw['regulator'].str.replace(';', '|').str.lower()
lncRNA_pw['regulator'] = [replace_with_pro(item) for item in lncRNA_pw['regulator']]

# NOTE(review): replace_with_mirbase runs before ';' is turned into '|' and before the
# lower-casing, so multi-target cells may not be split for the miRBase lookup and any value it
# inserts is lower-cased afterwards - confirm this order is intended.
lncRNA_pw['target'] = lncRNA_pw['target'].str.replace('miR', 'hsa-miR')
lncRNA_pw['target'] = [replace_with_mirbase(item) for item in lncRNA_pw['target']]
lncRNA_pw['target'] = lncRNA_pw['target'].str.replace(';', '|').str.lower()
lncRNA_pw['target'] = [replace_with_pro(item) for item in lncRNA_pw['target']]

lncRNA_pw['experimental_method'] = lncRNA_pw['experimental_method'].str.replace('wetern', 'western')
lncRNA_pw['experimental_method'] = lncRNA_pw['experimental_method'].str.replace('assay assay', 'assay')
lncRNA_pw['experimental_method'] = lncRNA_pw['experimental_method'].str.lower()
lncRNA_pw['experimental_method'] = lncRNA_pw['experimental_method'].str.replace(';', '|')
lncRNA_pw['experimental_method'] = [replace_with_ncit(item) for item in lncRNA_pw['experimental_method']]

lncRNA_pw['biological_process'] = lncRNA_pw['biological_process'].str.lower()
lncRNA_pw['biological_process'] = lncRNA_pw['biological_process'].str.replace(';', '|')
lncRNA_pw['biological_process'] = [replace_with_go(item) for item in lncRNA_pw['biological_process']]

lncRNA_pw['drug'] = lncRNA_pw['drug'].str.lower()
lncRNA_pw['drug'] = [replace_with_drugbank(item) for item in lncRNA_pw['drug']]

In [None]:
# Column 1 (an integer label coming from desc_pw_map) becomes 'pathway' so that every header
# is a string before the text clean-up below.
lncRNA_pw = lncRNA_pw.rename(columns={1: 'pathway'})
# Prettify the headers ('_' -> ' ', first letter upper-case, the rest lower-case), then restore
# the names that the capitalisation flattened.
lncRNA_pw.columns = lncRNA_pw.columns.str.replace('_', ' ').str.capitalize()
lncRNA_pw = lncRNA_pw.rename(columns={
    '0 y': 'lncRNA',
    '1 y x': 'Protein',
    'Pmid': 'References (PMID)',
    'Transcript id': 'Transcript ID',
    '1 y y': 'Regulator',
})
# Edge endpoints first.
for position, column in enumerate(('lncRNA', 'Pathway')):
    lncRNA_pw.insert(position, column, lncRNA_pw.pop(column))

In [None]:
# Collapse the rows per (lncRNA, Pathway) pair with merge_rows and write the result as a
# tab-separated edge file without the index column.
merge_rows(lncRNA_pw,'lncRNA','Pathway').to_csv(edge_data_location + 'RlncRNA-pw.txt', sep='\t', index=None)

***
### lncRNA-biological process - http://purl.obolibrary.org/obo/RO_0000056 (participates in)

***
* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# Load LncRNAWiki and keep the rows with a biological context, minus the columns that are not
# exported for this edge type.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
lncRNA_gobp2 = LncRNAWiki.loc[LncRNAWiki['biological_context'].notna()].drop(columns=[
    'synonyms', 'gene_locus', 'gene_id', 'genome_variation', 'tissue/cell line',
    'variation_detail', 'expression', 'expression_detail', 'regulator_type',
    'regulator', 'regulator_interaction', 'regulator_effect',
    'experimental_method', 'molecular_function', 'clinical_detail',
    'drug', 'target_interaction',
])
# Lower-case the context and blank out the generic 'disease' value (it becomes NaN).
lncRNA_gobp2['biological_context'] = lncRNA_gobp2['biological_context'].str.lower()
lncRNA_gobp2['biological_context'] = lncRNA_gobp2['biological_context'].where(
    lncRNA_gobp2['biological_context'] != 'disease')
# Inner-join on the context description, then on the gene symbol; the join keys are dropped
# once they have served their purpose.
lncRNA_gobp2 = (
    lncRNA_gobp2
    .merge(desc_go_map.rename(columns={0: 'biological_context'}), on=['biological_context'])
    .drop(columns='biological_context')
    .merge(symbol_entrez_map.rename(columns={'0_x': 'symbol'}), on=['symbol'])
    .drop(columns='symbol')
)
lncRNA_gobp2['0_y'] = lncRNA_gobp2['0_y'].astype(str) + '?lncRNA'
lncRNA_gobp2 = lncRNA_gobp2.rename(columns={'0_y': 'geneid', 1: 'gobp', 'pmid': 'PMID'})
lncRNA_gobp2['Source(s)'] = 'LncRNAWiki'
lncRNA_gobp2.head()

Manual fix.

In [None]:
# Rewrite the free-text annotation columns as '<ontology IRI> (<label>)' strings; multi-valued
# cells use '|' as separator.
lncRNA_gobp2['transcript_id'] = lncRNA_gobp2['transcript_id'].str.replace(',','|')

lncRNA_gobp2.conservation_ortholog = lncRNA_gobp2.conservation_ortholog.str.replace('Mouse', 'http://purl.obolibrary.org/obo/NCBITaxon_10090 (mus musculus)')

lncRNA_gobp2['epigenetic_modification'] = lncRNA_gobp2['epigenetic_modification'].str.replace('DNA methylation','http://purl.obolibrary.org/obo/GO_0006306 (dna methylation)')

lncRNA_gobp2['target_effect'] = lncRNA_gobp2['target_effect'].str.replace('inhibit', 'http://purl.obolibrary.org/obo/NCIT_C42791 (inhibition)')

lncRNA_gobp2['biological_process'] = lncRNA_gobp2['biological_process'].str.replace(
    'Proliferation;Apoptosis', 'http://purl.obolibrary.org/obo/NCIT_C28378 (proliferation)|http://purl.obolibrary.org/obo/NCIT_C17557 (apoptosis)')

lncRNA_gobp2['target_type'] = lncRNA_gobp2['target_type'].str.replace('miRNA', 'http://purl.obolibrary.org/obo/SO_0000276 (mirna)')
lncRNA_gobp2['target_type'] = lncRNA_gobp2['target_type'].str.replace('TF', 'http://purl.obolibrary.org/obo/NCIT_C17207 (transcription factor)')
lncRNA_gobp2['target_type'] = lncRNA_gobp2['target_type'].str.replace('Protein', 'http://purl.obolibrary.org/obo/PR_000000001 (protein)')

lncRNA_gobp2['target'] = lncRNA_gobp2['target'].str.replace('miR-675', 'https://www.mirbase.org/hairpin/MI0005416')
lncRNA_gobp2['target'] = lncRNA_gobp2['target'].str.replace('STAT3', 'http://purl.obolibrary.org/obo/PR_000002089 (stat3)')
lncRNA_gobp2['target'] = lncRNA_gobp2['target'].str.replace('Runx2', 'http://purl.obolibrary.org/obo/PR_000014364 (runx2)')
lncRNA_gobp2['target'] = lncRNA_gobp2['target'].str.replace('miR-320a;miR-383', 'https://www.mirbase.org/hairpin/MI0000542|https://www.mirbase.org/hairpin/MI0000791')

lncRNA_gobp2.context_detail = [replace_with_mondo(item) for item in lncRNA_gobp2.context_detail]

lncRNA_gobp2['pathway'] = lncRNA_gobp2['pathway'].str.lower()
lncRNA_gobp2['pathway'] = lncRNA_gobp2['pathway'].str.replace(';', '|')
# The '??' below are literal characters in the data (presumably a mis-encoded Greek letter).
# As a regular expression '??' is a lazy quantifier and would match something else, so the
# patterns are forced to be literal regardless of the pandas default for `regex`.
lncRNA_gobp2['pathway'] = lncRNA_gobp2['pathway'].str.replace('nf-??b', 'nuclear factor kappa B', regex=False)
lncRNA_gobp2['pathway'] = lncRNA_gobp2['pathway'].str.replace('/??-catenin', '', regex=False)
lncRNA_gobp2['pathway'] = lncRNA_gobp2['pathway'].str.replace('pi3k/akt ', '')
lncRNA_gobp2['pathway'] = [replace_with_pw(item) for item in lncRNA_gobp2['pathway']]

In [None]:
# Move the edge endpoints to the front; inserting 'gobp' first and 'geneid' second at
# position 0 leaves 'geneid' as the first column.
for column in ('gobp', 'geneid'):
    lncRNA_gobp2.insert(0, column, lncRNA_gobp2.pop(column))
# Keep the free-text process annotation apart from the grounded 'gobp' column.
lncRNA_gobp2 = lncRNA_gobp2.rename(columns={'biological_process': 'biological_process2'})
# Prettify the headers ('_' -> ' ', first letter upper-case, the rest lower-case), then restore
# the names that the capitalisation flattened.
lncRNA_gobp2.columns = lncRNA_gobp2.columns.str.replace('_', ' ').str.capitalize()
lncRNA_gobp2 = lncRNA_gobp2.rename(columns={
    'Geneid': 'lncRNA',
    'Pmid': 'References (PMID)',
    'Transcript id': 'Transcript ID',
    'Gobp': 'Biological process',
})

In [None]:
# Collapse the rows per (lncRNA, Biological process) pair with merge_rows and write the result
# as a tab-separated edge file without the index column.
merge_rows(lncRNA_gobp2,'lncRNA','Biological process').to_csv(edge_data_location + 'RlncRNA-gobp.txt', sep='\t', index=None)

In [None]:
def _labelled_iri_lookup(keys, desc_map):
    """Return {key: '<OBO IRI> (<description>)'} built from a description map.

    Column 0 of `desc_map` holds the description and column 1 the term identifier;
    `keys` supplies the dictionary keys, paired positionally with the rows of `desc_map`.
    """
    labelled_iris = 'http://purl.obolibrary.org/obo/' + desc_map[1] + ' (' + desc_map[0] + ')'
    return dict(zip(keys, labelled_iris))


def _replace_parts(substring, lookup):
    """Translate each '|'-separated part of `substring` through `lookup`.

    Parts without an entry are kept unchanged; a missing value yields NaN.
    """
    if pd.isna(substring):
        return np.nan
    return '|'.join(lookup.get(part, part) for part in substring.split('|'))


uberon_dict = _labelled_iri_lookup(desc_uberon_map[0], desc_uberon_map)


def replace_with_uberon(substring):
    """Ground the '|'-separated parts of `substring` with uberon_dict."""
    return _replace_parts(substring, uberon_dict)


ncit_dict = _labelled_iri_lookup(desc_ncit_map[0], desc_ncit_map)

clo_dict = _labelled_iri_lookup(desc_clo_map[0], desc_clo_map)


def replace_with_clo(substring):
    """Ground the '|'-separated parts of `substring` with clo_dict."""
    return _replace_parts(substring, clo_dict)


# Same terms as clo_dict, but keyed by the description with ' cell' removed.
clo_dict2 = _labelled_iri_lookup(desc_clo_map[0].str.replace(' cell', ''), desc_clo_map)


def replace_with_clo2(substring):
    """Ground the '|'-separated parts of `substring` with clo_dict2."""
    return _replace_parts(substring, clo_dict2)


def replace_with_ncit(substring):
    """Ground the '|'-separated parts of `substring` with ncit_dict."""
    return _replace_parts(substring, ncit_dict)


mondo_dict = _labelled_iri_lookup(desc_disPhe_map[0], desc_disPhe_map)


def replace_with_mondo(substring):
    """Ground the '|'-separated parts of `substring` with mondo_dict."""
    return _replace_parts(substring, mondo_dict)


pro_dict = _labelled_iri_lookup(desc_pro_map[0], desc_pro_map)


def replace_with_pro(substring):
    """Ground the '|'-separated parts of `substring` with pro_dict."""
    return _replace_parts(substring, pro_dict)
    
def merge_rows(df, column1, column2):
    """Collapse rows that share the same (column1, column2) pair into one row.

    Exact duplicate rows are removed first. Within each group, every other
    column is reduced to its distinct non-null values, converted to ``str``
    and joined with '|' in order of first appearance, so repeated runs on the
    same input produce the same text. A column with no non-null value in a
    group becomes the empty string.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to collapse; it is not modified.
    column1, column2 : hashable
        Names of the two columns that identify a group.

    Returns
    -------
    pandas.DataFrame
        One row per (column1, column2) pair, with the two key columns first.
    """
    def _join_distinct(values):
        # dict.fromkeys de-duplicates while keeping insertion order; a plain set
        # would make the joined order depend on string hashing.
        return '|'.join(dict.fromkeys(str(v) for v in values if pd.notnull(v)))

    deduplicated = df.drop_duplicates()
    df_merged = deduplicated.groupby([column1, column2]).agg(_join_distinct).reset_index()
    return df_merged.drop_duplicates()

# Label -> '<IRI> (<label>)' lookups: column 0 of each map frame is the key, column 1
# the identifier appended to the base IRI.
go_dict = dict(zip(
    desc_go_map[0],
    'http://purl.obolibrary.org/obo/' + desc_go_map[1] + ' (' + desc_go_map[0] + ')',
))


def replace_with_go(substring):
    """Translate each '|'-separated label of *substring* through ``go_dict``.

    Labels without an entry are kept unchanged; a missing value yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    return '|'.join(go_dict.get(label, label) for label in substring.split('|'))


pw_dict = dict(zip(
    desc_reactome_map[0],
    'http://purl.obolibrary.org/obo/' + desc_reactome_map[1] + ' (' + desc_reactome_map[0] + ')',
))


def replace_with_pw(substring):
    """Translate each '|'-separated label of *substring* through ``pw_dict``.

    Labels without an entry are kept unchanged; a missing value yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    return '|'.join(pw_dict.get(label, label) for label in substring.split('|'))


# DrugBank vocabulary: lower-cased common name -> 'https://go.drugbank.com/drugs/<id> (<name>)'.
DrugBank = pd.read_csv(unprocessed_data_location + 'drugbank vocabulary.csv')
DrugBank['Common name'] = DrugBank['Common name'].str.lower()
drugbank_dict = dict(zip(
    DrugBank['Common name'],
    'https://go.drugbank.com/drugs/' + DrugBank['DrugBank ID'] + ' (' + DrugBank['Common name'] + ')',
))


def replace_with_drugbank(substring):
    """Translate each '|'-separated name of *substring* through ``drugbank_dict``.

    Names without an entry are kept unchanged; a missing value yields ``np.nan``.
    """
    if pd.isna(substring):
        return np.nan
    return '|'.join(drugbank_dict.get(name, name) for name in substring.split('|'))

***
### miRNA-TF(protein)

* [PuTmiR 1.1](https://www.isical.ac.in/~bioinfo_miu/TF-miRNA1.php) <br/>
PuTmiR is a web server designed for extracting the putative TFs for human miRNAs, as per the requirement of a user, based on genomic locality, i.e., any upstream or downstream region of interest less than 10 kb.

#### http://purl.obolibrary.org/obo/RO_0002528 (is upstream of sequence of)

In [None]:
# Download PuTmiR's upstream-region TF-miRNA table.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# from unprocessed_data_location — confirm the two coincide.
! wget https://www.isical.ac.in/~bioinfo_miu/UpstreamRegionTF-miRNA1.txt

In [None]:
# Load the upstream table, then inner-join it to the protein map (on the TF symbol) and to
# the miRBase map (on the miRNA name); rows without a match in either map are dropped.
miRNA_TF_up = (
    pd.read_csv(unprocessed_data_location + 'UpstreamRegionTF-miRNA1.txt', sep='\t')
    .merge(symbol_to_pro.rename(columns={'0_x': 'TF'}), on='TF')
    .merge(mirna_mirbase_map.rename(columns={2: 'name'}), on='name')
    .drop(columns=['chromStart', 'chromEnd', 'TF', 'name'])
)
miRNA_TF_up['Source(s)'] = 'PuTmiR'
miRNA_TF_up

Manual fix of inconsistencies.

In [None]:
# Lower-case the chromosome labels, then swap each one for its NCIt 'IRI (label)' form
# wherever ncit_dict has an entry.
miRNA_TF_up['chrom'] = miRNA_TF_up['chrom'].str.lower().map(replace_with_ncit)

In [None]:
# Give the columns their display names. '1_y' and 0 are presumably the identifier columns
# contributed by symbol_to_pro and mirna_mirbase_map in the merges above — verify.
miRNA_TF_up.rename(columns={'chrom':'Chromosome','strand':'Strand','chromStartTF':'ChromStartTF',
                            'chromEndTF':'ChromEndTF','Refseq':'Refseq ID','score':'Score','1_y':'TF',0:'miRNA'},inplace=True)
# Put the edge endpoints first: 'miRNA' in position 0, 'TF' in position 1.
miRNA_TF_up.insert(0,'miRNA',miRNA_TF_up.pop('miRNA'))
miRNA_TF_up.insert(1,'TF',miRNA_TF_up.pop('TF'))

In [None]:
# Accessions starting with 'MIMAT' go to the mature-miRNA file; the remaining 'MI…'
# accessions go to the pre-miRNA file.
is_mature_up = miRNA_TF_up['miRNA'].str.startswith('MIMAT')
is_mirbase_up = miRNA_TF_up['miRNA'].str.startswith('MI')
maturemiRNA_TF_up = miRNA_TF_up[is_mature_up]
premiRNA_TF_up = miRNA_TF_up[is_mirbase_up & ~is_mature_up]

maturemiRNA_TF_up.drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-TFup.txt', sep='\t', index=None)
premiRNA_TF_up.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-TFup.txt', sep='\t', index=None)

#### http://purl.obolibrary.org/obo/RO_0002529 (is downstream of sequence of)

In [None]:
# Download PuTmiR's downstream-region TF-miRNA table.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# from unprocessed_data_location — confirm the two coincide.
! wget https://www.isical.ac.in/~bioinfo_miu/DownstreamRegionTF-miRNA1.txt

In [None]:
# Load the downstream table, then inner-join it to the protein map (on the TF symbol) and
# to the miRBase map (on the miRNA name); rows without a match in either map are dropped.
miRNA_TF_down = (
    pd.read_csv(unprocessed_data_location + 'DownstreamRegionTF-miRNA1.txt', sep='\t')
    .merge(symbol_to_pro.rename(columns={'0_x': 'TF'}), on='TF')
    .merge(mirna_mirbase_map.rename(columns={2: 'name'}), on='name')
    .drop(columns=['chromStart', 'chromEnd', 'TF', 'name'])
)
miRNA_TF_down['Source(s)'] = 'PuTmiR'
miRNA_TF_down

Manual fix of inconsistencies.

In [None]:
# Lower-case the chromosome labels, then swap each one for its NCIt 'IRI (label)' form
# wherever ncit_dict has an entry.
miRNA_TF_down['chrom'] = miRNA_TF_down['chrom'].str.lower().map(replace_with_ncit)

In [None]:
# Give the columns their display names. '1_y' and 0 are presumably the identifier columns
# contributed by symbol_to_pro and mirna_mirbase_map in the merges above — verify.
miRNA_TF_down.rename(columns={'chrom':'Chromosome','strand':'Strand','chromStartTF':'ChromStartTF',
                            'chromEndTF':'ChromEndTF','Refseq':'Refseq ID','score':'Score','1_y':'TF',0:'miRNA'},inplace=True)
# Put the edge endpoints first: 'miRNA' in position 0, 'TF' in position 1.
miRNA_TF_down.insert(0,'miRNA',miRNA_TF_down.pop('miRNA'))
miRNA_TF_down.insert(1,'TF',miRNA_TF_down.pop('TF'))

In [None]:
# Accessions starting with 'MIMAT' go to the mature-miRNA file; the remaining 'MI…'
# accessions go to the pre-miRNA file.
is_mature_down = miRNA_TF_down['miRNA'].str.startswith('MIMAT')
is_mirbase_down = miRNA_TF_down['miRNA'].str.startswith('MI')
maturemiRNA_TF_down = miRNA_TF_down[is_mature_down]
premiRNA_TF_down = miRNA_TF_down[is_mirbase_down & ~is_mature_down]

maturemiRNA_TF_down.drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-TFdown.txt', sep='\t', index=None)
premiRNA_TF_down.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-TFdown.txt', sep='\t', index=None)

***
### premiRNA-TF(protein) - http://purl.obolibrary.org/obo/RO_0002428 (involved in regulation of)

* [miRNet](https://www.mirnet.ca/)

In [None]:
# Download the miRNet miRNA-TF table (human).
# NOTE(review): wget saves into the current working directory, while the next cell reads
# from unprocessed_data_location — confirm the two coincide.
! wget https://www.dropbox.com/s/78r0tazedtkhi5g/miRNet-mir-tf-hsa.csv

In [None]:
# Load the miRNet table and drop the columns that are not used further on.
miRNA_TF = pd.read_csv(unprocessed_data_location + 'miRNet-mir-tf-hsa.csv').drop(columns=[
    'mirnet','mir_acc','entrez','embl','gene_name','mbv'])
# Tag every row with its provenance.
miRNA_TF['Source(s)'] = 'miRNet'
miRNA_TF

***
* [TransmiR](https://www.cuilab.cn/transmir) <br /> TransmiR is a database for transcription factor (TF)-microRNA (miRNA) regulations, through which one can find regulatory relations between TFs and miRNAs.

In [None]:
# Download the TransmiR literature-curated table (human).
# NOTE(review): wget saves into the current working directory, while the next cell reads
# from unprocessed_data_location — confirm the two coincide.
! wget https://www.cuilab.cn/files/images/transmir2/download/literature/hsa.xlsx

In [None]:
# The workbook is read without a header row, so its columns are addressed by integer
# position (0, 1, 2, ...) in the cells that follow.
miRNA_TF2 = pd.read_excel(unprocessed_data_location+"hsa.xlsx", header=None)
# Tag every row with its provenance.
miRNA_TF2['Source(s)'] = 'TransmiR'
miRNA_TF2

In [None]:
# Outer-join miRNet (keys 'mir_id'/'symbol') with TransmiR (keys: its headerless
# columns 1 and 0), keeping pairs reported by either source.
miRNA_TF = pd.merge(miRNA_TF, miRNA_TF2, how='outer', left_on=['mir_id','symbol'], right_on = [1,0])
# Back-fill each pair of key columns from the other so that rows coming from only one
# source still carry both keys. Plain assignment is used instead of
# `df[col].fillna(..., inplace=True)`, which acts on a temporary object and is a no-op
# when pandas runs with Copy-on-Write.
miRNA_TF[0] = miRNA_TF[0].fillna(miRNA_TF['symbol'])
miRNA_TF['symbol'] = miRNA_TF['symbol'].fillna(miRNA_TF[0])
miRNA_TF['mir_id'] = miRNA_TF['mir_id'].fillna(miRNA_TF[1])
miRNA_TF[1] = miRNA_TF[1].fillna(miRNA_TF['mir_id'])

# Combine provenance as '<miRNet or nan>|<TransmiR or nan>'; the 'nan' parts are stripped
# in a later cell.
miRNA_TF['Source(s)_x'] = miRNA_TF['Source(s)_x'].astype(str)
miRNA_TF['Source(s)_y'] = miRNA_TF['Source(s)_y'].astype(str)
miRNA_TF['Source(s)'] = miRNA_TF['Source(s)_x'] + '|' + miRNA_TF['Source(s)_y']
miRNA_TF = miRNA_TF.drop(columns=['Source(s)_x', 'Source(s)_y', 1, 0, 8])

# Render both PMID columns as text. The nullable-integer cast avoids float formatting,
# and missing values become the literal text '<NA>'.
miRNA_TF['pmid'] = miRNA_TF['pmid'].astype('Int64').astype(str)
miRNA_TF[5] = miRNA_TF[5].astype('Int64').astype(str)
# Where both sources give a different PMID, keep both, '|'-separated.
miRNA_TF.loc[(miRNA_TF['pmid'] != '<NA>') & (miRNA_TF[5] != '<NA>') &
                 (miRNA_TF['pmid'] != miRNA_TF[5]),
                 "pmid"] = miRNA_TF["pmid"] + '|' + miRNA_TF[5]
# Where miRNet has no PMID, take TransmiR's. The test is against the '<NA>' text because
# after the cast to str there are no real missing values left for fillna to act on.
miRNA_TF['pmid'] = miRNA_TF['pmid'].mask(miRNA_TF['pmid'] == '<NA>', miRNA_TF[5])
miRNA_TF = miRNA_TF.drop(columns=[5,7])

# Prefer miRNet's action type and fall back to TransmiR's column 4.
miRNA_TF['action_type'] = miRNA_TF['action_type'].fillna(miRNA_TF[4])
miRNA_TF = miRNA_TF.drop(columns=[2,3,4,6])

miRNA_TF

***
* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# TAM set file: read with up to 500 positional columns and discard the ones that are
# entirely empty.
TAM = pd.read_csv(
    unprocessed_data_location + 'mirset_v9.txt', sep='\t', names=range(500)
).dropna(axis=1, how='all')

# Keep the rows whose first field contains "TF", drop columns that are now empty, and
# drop the first field itself.
miRNA_TF2 = TAM[TAM[0].str.contains("TF")].dropna(axis=1, how='all').drop(columns=[0])
# Column 1 is kept as the set name; all later columns are the set members, joined into
# one comma-separated string per row.
miRNA_TF2['merged'] = miRNA_TF2[miRNA_TF2.columns[1:]].apply(
    lambda members: ','.join(members.dropna().astype(str)),
    axis=1,
)
miRNA_TF2 = miRNA_TF2[[1, 'merged']]

# One row per (set name, member).
miRNA_TF2['merged'] = miRNA_TF2.merged.str.split(',')
miRNA_TF2 = miRNA_TF2.explode('merged')
miRNA_TF2['Source(s)'] = 'TAM'
miRNA_TF2

In [None]:
# Outer-join the miRNet/TransmiR table with the TAM (set name, member) pairs.
# NOTE(review): rows contributed only by TAM get no 'symbol'/'mir_id' here (unlike the
# TransmiR merge, they are not back-filled from 1/'merged'), so the inner joins below
# appear to drop them — confirm this is intended.
miRNA_TF = pd.merge(miRNA_TF, miRNA_TF2, how='outer', left_on=['mir_id','symbol'], right_on = ['merged',1])

# Resolve TF symbols and miRNA names through the identifier maps (inner joins).
miRNA_TF = pd.merge(miRNA_TF, symbol_to_pro.rename(columns={'0_x':'symbol'}), on=['symbol'])
miRNA_TF = pd.merge(miRNA_TF,miRBaseMap,left_on='mir_id',right_on=[2])

# Append TAM's provenance to the existing provenance string.
miRNA_TF['Source(s)_x'] = miRNA_TF['Source(s)_x'].astype(str)
miRNA_TF['Source(s)_y'] = miRNA_TF['Source(s)_y'].astype(str)
miRNA_TF['Source(s)'] = miRNA_TF['Source(s)_x'] + '|' + miRNA_TF['Source(s)_y']
miRNA_TF = miRNA_TF.drop(columns=['Source(s)_x', 'Source(s)_y','symbol','mir_id',2,'merged',1])

# Strip the 'nan' placeholders left by sources that did not report the pair. Raw string:
# '\|' is not a valid escape sequence in an ordinary string literal.
miRNA_TF['Source(s)'] = miRNA_TF['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

miRNA_TF

Manual fix of inconsistencies.

- Action type

In [None]:
# Show a sample of the raw action types, normalise them, then show the sample again.
print(miRNA_TF['action_type'].unique()[:5])
# Lower-case, then swap each label for its NCIt 'IRI (label)' form where ncit_dict has one.
miRNA_TF['action_type'] = miRNA_TF['action_type'].str.lower().map(replace_with_ncit)
miRNA_TF['action_type'].unique()[:5]

In [None]:
# Turn PMIDs into PubMed URLs. errors='ignore' leaves the column untouched when it cannot
# be cast to nullable integers.
# NOTE(review): values that already hold several '|'-joined PMIDs get the URL prefix only
# once, in front of the first one.
miRNA_TF['pmid'] = 'https://pubmed.ncbi.nlm.nih.gov/' + miRNA_TF['pmid'].astype('Int64', errors='ignore').astype('str')
# A missing PMID was rendered as the text '<NA>'; map the resulting pseudo-URL back to NaN.
miRNA_TF['pmid'] = miRNA_TF['pmid'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)
# Display names; 0 and '1_y' are presumably the identifier columns added by the
# miRBase and protein map merges — verify.
miRNA_TF.rename(columns={0:'miRNA','1_y':'TF','action_type':'Action type','pmid':'References (PMID)'},inplace=True)
# Put the edge endpoints first.
miRNA_TF.insert(0,'miRNA',miRNA_TF.pop('miRNA'))
miRNA_TF.insert(1,'TF',miRNA_TF.pop('TF'))

In [None]:
# Write the de-duplicated miRNA-TF edges as a tab-separated file without the index column.
miRNA_TF.drop_duplicates().to_csv(edge_data_location + 'RpremiRNA-TF.txt', sep='\t', index=None)

***
### miRNA-molecular function - http://purl.obolibrary.org/obo/RO_0000085 (has function)

* [TAM](http://www.lirmed.com/tam2/) <br /> TAM groups miRNAs into six categories of miRNA sets: miRNA-family sets, miRNA cluster sets, miRNA-disease, miRNA-function sets, miRNA-TF sets and tissue specificity sets.

In [None]:
# TAM is expected to be loaded already by the miRNA-TF section; uncomment to reload it
# when running this section on its own.
#TAM = pd.read_csv('http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt', sep='\t',names=range(500))
#TAM=TAM.dropna(axis=1, how='all')

# Rows whose first field contains "unction". The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice of TAM, which
# would trigger pandas' chained-assignment warning.
miRNA_GO=TAM[TAM[0].str.contains("unction")].copy()
# Lower-case the set names so they can be matched against the GO labels later.
miRNA_GO[1] = miRNA_GO[1].str.lower()
miRNA_GO=miRNA_GO.dropna(axis=1, how='all')
miRNA_GO=miRNA_GO.drop(columns=[0])
miRNA_GO

In [None]:
# Inner-join on the label: desc_go_map column 0 against the TAM set name (column 1).
# Both frames have a column 1, so pandas suffixes them; the TAM copy ('1_y') duplicates
# the join key and is dropped, leaving desc_go_map's column as '1_x'.
miRNA_GO = pd.merge(desc_go_map, miRNA_GO, left_on=[0], right_on=[1]).drop(columns=['1_y'])
# Remove member columns left entirely empty after the join.
miRNA_GO = miRNA_GO.dropna(axis=1, how='all')
miRNA_GO

In [None]:
# Unpivot the wide table: for every matched set, one output row per member with the
# member name in column 0, the label in column 1 and the GO identifier in column 2.
dflist = []
for _, set_row in miRNA_GO.iterrows():
    members = pd.DataFrame(columns=[0, 1, 2])
    members[0] = set_row.drop(index=[0, '1_x'])  # everything except label and identifier
    members[1] = set_row[0]
    members[2] = set_row['1_x']
    dflist.append(members)

# Stack all pieces onto an empty frame with the target columns; rows for unused member
# slots (NaN) are discarded.
miRNA_GO = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist).dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the label.
miRNA_GO = miRNA_GO.rename(columns={0: 'a'}).merge(
    mirna_mirbase_map.rename(columns={2: 'a'}), on='a')
miRNA_GO = miRNA_GO.drop(columns=['a', 1])
miRNA_GO['Source(s)'] = 'TAM'
miRNA_GO

***
* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html) <br /> miRPathDB includes miRNA candidates, experimentally validated target genes, extended analysis functionality, and intuitive visualizations of query results. 

In [None]:
# Download the two miRPathDB v2 archives for human.
# NOTE(review): wget saves into the current working directory, while the next cell opens
# the archive from unprocessed_data_location — confirm the two coincide.
! wget https://mpd.bioinf.uni-sb.de/download/version_2/miRPathDB2_hsa_genetrail2_results.tar.gz
! wget https://mpd.bioinf.uni-sb.de/download/version_2/miRPathDB2_hsa_gmt.tar.gz

In [None]:
# Unpack the miRPathDB GMT archive. The context manager closes the archive even if
# extraction fails.
# NOTE(review): extractall() trusts the member paths stored in the archive; acceptable
# only because the archive comes from the miRPathDB site downloaded above.
with tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz') as tar:
    tar.extractall(unprocessed_data_location)

# GMT rows have a variable number of fields; read up to 50 positional columns.
miRNA_GO2 = pd.read_csv(unprocessed_data_location + 'hsa/GO_MF_validated_miRTarBase_strong.gmt', sep='\t', header=None,names=range(50))
# Lower-case the set names so they can be matched against the GO labels.
miRNA_GO2[0] = miRNA_GO2[0].str.lower()
miRNA_GO2=miRNA_GO2.dropna(axis=1, how='all')
miRNA_GO2=miRNA_GO2.drop(columns=[1])
# Inner-join on the label; desc_go_map contributes column 1 (the GO identifier).
miRNA_GO2 = pd.merge(desc_go_map, miRNA_GO2, left_on=[0], right_on=[0])
miRNA_GO2 = miRNA_GO2.dropna(axis=1, how='all')

# Unpivot: one output row per set member (member name, label, GO identifier).
dflist = list()
for _, set_row in miRNA_GO2.iterrows():
    df=pd.DataFrame(columns=[0,1,2])
    df[0] = set_row.drop(index=[0,1])
    df[1] = set_row[0]
    df[2] = set_row[1]
    dflist.append(df)

# A single concat instead of growing the frame inside a loop, which re-copies all
# accumulated rows on every iteration.
miRNA_GO2=pd.concat([pd.DataFrame(columns=[0,1,2])] + dflist)
miRNA_GO2=miRNA_GO2.dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the label.
miRNA_GO2 = pd.merge(miRNA_GO2.rename(columns={0:'a'}), mirna_mirbase_map.rename(columns={2:'a'}), on='a')
miRNA_GO2.drop(columns=['a',1], inplace=True)
miRNA_GO2['Source(s)'] = 'miRPathDB'
miRNA_GO2

In [None]:
# Outer-join the TAM and miRPathDB tables on (miRNA, GO identifier).
miRNA_GO = pd.merge(miRNA_GO, miRNA_GO2, how='outer', on=[0,2])

# Combine provenance as '<TAM or nan>|<miRPathDB or nan>'.
miRNA_GO['Source(s)_x'] = miRNA_GO['Source(s)_x'].astype(str)
miRNA_GO['Source(s)_y'] = miRNA_GO['Source(s)_y'].astype(str)
miRNA_GO['Source(s)'] = miRNA_GO['Source(s)_x'] + '|' + miRNA_GO['Source(s)_y']
miRNA_GO = miRNA_GO.drop(columns=['Source(s)_x', 'Source(s)_y'])

# Strip the 'nan' placeholder of the source that did not report the pair. Raw string:
# '\|' is not a valid escape sequence in an ordinary string literal.
miRNA_GO['Source(s)'] = miRNA_GO['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

miRNA_GO

In [None]:
# Display names for the two endpoints, then fix the column order of the edge file.
miRNA_GO.rename(columns={2:'Molecular function',0:'miRNA'},inplace=True)
miRNA_GO = miRNA_GO[['miRNA','Molecular function','Source(s)']]

In [None]:
# Accessions starting with 'MIMAT' are mature miRNAs; everything else is treated as pre-miRNA.
maturemiRNA_GO = miRNA_GO[miRNA_GO['miRNA'].str.startswith('MIMAT')]
premiRNA_GO = miRNA_GO[~miRNA_GO['miRNA'].str.startswith('MIMAT')]
# Molecular-function edges get their own '-GOMF' files. They used to be written to the
# '-GOCC' files, which the cellular-component section writes again later, so the
# molecular-function edges were silently overwritten.
# NOTE(review): make sure downstream configuration lists the '-GOMF' file names.
maturemiRNA_GO.drop_duplicates().to_csv(edge_data_location + 'RmiRNA-GOMF.txt', sep='\t', index=None)
premiRNA_GO.drop_duplicates().to_csv(edge_data_location + 'RpremiRNA-GOMF.txt', sep='\t', index=None)

***
### miRNA-biological process - http://purl.obolibrary.org/obo/RO_0000056 (participates in)

***
* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
# The archive was unpacked in the molecular-function section; uncomment to unpack it
# again when running this section on its own.
#tar = tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz')
#tar.extractall(unprocessed_data_location)
#tar.close()

# GMT rows have a variable number of fields; read up to 94 positional columns.
miRNA_GO = pd.read_csv(
    unprocessed_data_location + 'hsa/GO_BP_validated_miRTarBase_strong.gmt',
    sep='\t', header=None, names=range(94))
# Lower-case the set names so they can be matched against the GO labels.
miRNA_GO[0] = miRNA_GO[0].str.lower()
miRNA_GO = miRNA_GO.dropna(axis=1, how='all').drop(columns=[1])
# Inner-join on the label; desc_go_map contributes column 1 (the GO identifier).
miRNA_GO = desc_go_map.merge(miRNA_GO, left_on=[0], right_on=[0]).dropna(axis=1, how='all')

# Unpivot: one output row per set member (member name, label, GO identifier).
dflist = []
for _, set_row in miRNA_GO.iterrows():
    members = pd.DataFrame(columns=[0, 1, 2])
    members[0] = set_row.drop(index=[0, 1])
    members[1] = set_row[0]
    members[2] = set_row[1]
    dflist.append(members)

# Stack all pieces onto an empty frame with the target columns; drop unused member slots.
miRNA_GO = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist).dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the label.
miRNA_GO = miRNA_GO.rename(columns={0: 'a'}).merge(
    mirna_mirbase_map.rename(columns={2: 'a'}), on='a')
miRNA_GO = miRNA_GO.drop(columns=['a', 1])
miRNA_GO['Source(s)'] = 'miRPathDB'
miRNA_GO

In [None]:
# Display names for the two endpoints, then fix the column order of the edge file.
miRNA_GO.rename(columns={2:'Biological process',0:'miRNA'},inplace=True)
miRNA_GO = miRNA_GO[['miRNA','Biological process','Source(s)']]

In [None]:
# Accessions starting with 'MIMAT' are mature miRNAs; everything else is treated as pre-miRNA.
is_mature_bp = miRNA_GO['miRNA'].str.startswith('MIMAT')
maturemiRNA_GO = miRNA_GO[is_mature_bp]
premiRNA_GO = miRNA_GO[~is_mature_bp]
maturemiRNA_GO.drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-GOBP.txt', sep='\t', index=None)
premiRNA_GO.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-GOBP.txt', sep='\t', index=None)

***
### miRNA-cellular component - http://purl.obolibrary.org/obo/RO_0001025 (located in)


***
* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
# The archive was unpacked in the molecular-function section; uncomment to unpack it
# again when running this section on its own.
#tar = tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz')
#tar.extractall(unprocessed_data_location)
#tar.close()

# GMT rows have a variable number of fields; read up to 94 positional columns.
miRNA_GO = pd.read_csv(
    unprocessed_data_location + 'hsa/GO_CC_validated_miRTarBase_strong.gmt',
    sep='\t', header=None, names=range(94))
# Lower-case the set names so they can be matched against the GO labels.
miRNA_GO[0] = miRNA_GO[0].str.lower()
miRNA_GO = miRNA_GO.dropna(axis=1, how='all').drop(columns=[1])
# Inner-join on the label; desc_go_map contributes column 1 (the GO identifier).
miRNA_GO = desc_go_map.merge(miRNA_GO, on=[0]).dropna(axis=1, how='all')

# Unpivot: one output row per set member (member name, label, GO identifier).
dflist = []
for _, set_row in miRNA_GO.iterrows():
    members = pd.DataFrame(columns=[0, 1, 2])
    members[0] = set_row.drop(index=[0, 1])
    members[1] = set_row[0]
    members[2] = set_row[1]
    dflist.append(members)

# Stack all pieces onto an empty frame with the target columns; drop unused member slots.
miRNA_GO = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist).dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the label.
miRNA_GO = miRNA_GO.rename(columns={0: 'a'}).merge(
    mirna_mirbase_map.rename(columns={2: 'a'}), on='a')
miRNA_GO = miRNA_GO.drop(columns=['a', 1])
miRNA_GO['Source(s)'] = 'miRPathDB'
miRNA_GO

In [None]:
# Display names for the two endpoints, then fix the column order of the edge file.
miRNA_GO.rename(columns={2:'Cellular component',0:'miRNA'},inplace=True)
miRNA_GO = miRNA_GO[['miRNA','Cellular component','Source(s)']]

In [None]:
# Accessions starting with 'MIMAT' are mature miRNAs; everything else is treated as pre-miRNA.
is_mature_cc = miRNA_GO['miRNA'].str.startswith('MIMAT')
maturemiRNA_GO = miRNA_GO[is_mature_cc]
premiRNA_GO = miRNA_GO[~is_mature_cc]

maturemiRNA_GO.drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-GOCC.txt', sep='\t', index=None)
premiRNA_GO.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-GOCC.txt', sep='\t', index=None)

***
### miRNA-pathway - http://purl.obolibrary.org/obo/RO_0000056 (participates in)

***
* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
# ComPath KEGG<->Reactome mapping file; downloaded only when no local copy exists.
url2 = 'https://raw.githubusercontent.com/ComPath/resources/master/mappings/kegg_reactome.csv'
if not os.path.exists(unprocessed_data_location + 'kegg_reactome.csv'):
    data_downloader(url2, unprocessed_data_location, 'kegg_reactome.csv')

# Keep the 'Source Name' / 'Source ID' pair and relabel it 0 / 1, the name/identifier
# layout the other map frames in this notebook use.
kegg_reactome_map = pd.read_csv(
    unprocessed_data_location + 'kegg_reactome.csv', header=0, delimiter=','
)[['Source Name', 'Source ID']].set_axis([0, 1], axis=1)
kegg_reactome_map

In [None]:
# GMT rows have a variable number of fields; read up to 182 positional columns.
miRNA_pw = pd.read_csv(
    unprocessed_data_location + 'hsa/KEGG_validated_miRTarBase_strong.gmt',
    sep='\t', header=None, names=range(182))
miRNA_pw = miRNA_pw.dropna(axis=1, how='all').drop(columns=[1])
# Inner-join on the pathway name; kegg_reactome_map contributes column 1 (the identifier).
miRNA_pw = kegg_reactome_map.merge(miRNA_pw, on=[0]).dropna(axis=1, how='all')

# Unpivot: one output row per set member (member name, pathway name, pathway identifier).
dflist = []
for _, set_row in miRNA_pw.iterrows():
    members = pd.DataFrame(columns=[0, 1, 2])
    members[0] = set_row.drop(index=[0, 1])
    members[1] = set_row[0]
    members[2] = set_row[1]
    dflist.append(members)

# Stack all pieces onto an empty frame with the target columns; drop unused member slots.
miRNA_pw = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist).dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the pathway name.
miRNA_pw = miRNA_pw.rename(columns={0: 'a'}).merge(
    mirna_mirbase_map.rename(columns={2: 'a'}), on='a')
miRNA_pw = miRNA_pw.drop(columns=['a', 1])
miRNA_pw['Source(s)'] = 'miRPathDB'
miRNA_pw.head()

In [None]:
# Reactome pathway list; downloaded only when no local copy exists.
url = 'https://reactome.org/download/current/ReactomePathways.txt'
if not os.path.exists(unprocessed_data_location + 'ReactomePathways.txt'):
    data_downloader(url, unprocessed_data_location)

# The file has no header row, so columns are addressed by position.
reactome_pathways = pd.read_csv(unprocessed_data_location + 'ReactomePathways.txt', header=None, delimiter='\t')
# remove all non-human pathways
reactome_pathways = reactome_pathways[reactome_pathways[2] == 'Homo sapiens'][[0,1]]
# Swap the labels: the file's first column becomes 1 and its second becomes 0, so that
# column 0 is the join key used against the GMT set names in the next cell.
reactome_pathways.columns=[1,0]
reactome_pathways

In [None]:
# The archive was unpacked in the molecular-function section; uncomment to unpack it
# again when running this section on its own.
#tar = tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz')
#tar.extractall(unprocessed_data_location)
#tar.close()
# GMT rows have a variable number of fields; read up to 123 positional columns.
miRNA_pw2 = pd.read_csv(
    unprocessed_data_location + 'hsa/REACTOME_validated_miRTarBase_strong.gmt',
    sep='\t', header=None, names=range(123)).drop(columns=[1])
# Inner-join on the pathway name; reactome_pathways contributes column 1 (the identifier).
miRNA_pw2 = reactome_pathways.merge(miRNA_pw2, on=[0]).dropna(axis=1, how='all')

# Unpivot: one output row per set member (member name, pathway name, pathway identifier).
dflist = []
for _, set_row in miRNA_pw2.iterrows():
    members = pd.DataFrame(columns=[0, 1, 2])
    members[0] = set_row.drop(index=[0, 1])
    members[1] = set_row[0]
    members[2] = set_row[1]
    dflist.append(members)

# Stack all pieces onto an empty frame with the target columns; drop unused member slots.
miRNA_pw2 = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist).dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the pathway name.
miRNA_pw2 = miRNA_pw2.rename(columns={0: 'a'}).merge(
    mirna_mirbase_map.rename(columns={2: 'a'}), on='a')
miRNA_pw2 = miRNA_pw2.drop(columns=['a', 1])
miRNA_pw2['Source(s)'] = 'miRPathDB'
miRNA_pw2

In [None]:
# Pool the KEGG-derived and Reactome-derived miRNA-pathway rows.
miRNA_pw = pd.concat([miRNA_pw,miRNA_pw2]).drop_duplicates()

# Column 0 holds the miRBase accession; 'MIMAT' marks mature miRNAs, the rest are
# treated as pre-miRNAs.
maturemiRNA_pw = miRNA_pw[miRNA_pw[0].str.startswith('MIMAT')]
premiRNA_pw = miRNA_pw[~miRNA_pw[0].str.startswith('MIMAT')]

# Write the pathway frames built just above. The *_GO frames left over from the
# cellular-component section used to be written here, so the pathway files contained GO
# edges and the frames computed in this cell were never saved.
maturemiRNA_pw.drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-reactome.txt', header=None, sep='\t', index=None)
premiRNA_pw.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-reactome.txt', header=None, sep='\t', index=None)

In [None]:
# WikiPathways human gene-set file (GMT); downloaded only when no local copy exists.
# NOTE(review): the file is stored under the name 'wpw_reactome.csv' although it is a
# tab-separated WikiPathways GMT, and the URL is pinned to the 2023-11-10 release.
url2 = 'https://data.wikipathways.org/current/gmt/wikipathways-20231110-gmt-Homo_sapiens.gmt'
if not os.path.exists(unprocessed_data_location + 'wpw_reactome.csv'):
    data_downloader(url2, unprocessed_data_location, 'wpw_reactome.csv')

# Read up to 587 positional columns and keep only the first two fields of each row.
desc_wpw_map = pd.read_csv(unprocessed_data_location + 'wpw_reactome.csv', delimiter='\t', names=range(587))[[0,1]]
desc_wpw_map.columns=[0,1]
# Cut everything from '%WikiPathways_' to the end of the first field and lower-case the
# remainder, which is then used as the join key.
desc_wpw_map[0] = desc_wpw_map[0].str.replace(r'%WikiPathways_.*$', '', regex=True).str.lower()

desc_wpw_map

In [None]:
# GMT rows have a variable number of fields; read up to 148 positional columns.
miRNA_pw = pd.read_csv(
    unprocessed_data_location + 'hsa/WIKIPATHWAYS_validated_miRTarBase_strong.gmt',
    sep='\t', header=None, names=range(148))
# Normalise the set names before joining them to desc_wpw_map: '-', '/' and ':' become
# spaces, the text is lower-cased, and any parenthesised part is removed.
miRNA_pw[0] = (
    miRNA_pw[0]
    .str.replace('-', ' ')
    .str.lower()
    .str.replace('/', ' ')
    .str.replace(':', ' ')
    .str.replace(r'\(.*\)', '', regex=True)
)
miRNA_pw

In [None]:
miRNA_pw = miRNA_pw.dropna(axis=1, how='all').drop(columns=[1])
# Inner-join on the normalised pathway name; desc_wpw_map contributes column 1.
miRNA_pw = desc_wpw_map.merge(miRNA_pw, on=[0]).dropna(axis=1, how='all')

# Unpivot: one output row per set member (member name, pathway name, desc_wpw_map column 1).
dflist = []
for _, set_row in miRNA_pw.iterrows():
    members = pd.DataFrame(columns=[0, 1, 2])
    members[0] = set_row.drop(index=[0, 1])
    members[1] = set_row[0]
    members[2] = set_row[1]
    dflist.append(members)

# Stack all pieces onto an empty frame with the target columns; drop unused member slots.
miRNA_pw = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist).dropna()

# Resolve member names to miRBase entries (inner join), then drop the name and the pathway name.
miRNA_pw = miRNA_pw.rename(columns={0: 'a'}).merge(
    mirna_mirbase_map.rename(columns={2: 'a'}), on='a')
miRNA_pw = miRNA_pw.drop(columns=['a', 1])
miRNA_pw['Source(s)'] = 'miRPathDB'
miRNA_pw.head()

In [None]:
# Display names for the two endpoints, then fix the column order of the edge file.
miRNA_pw.rename(columns={2:'Pathway',0:'miRNA'},inplace=True)
miRNA_pw = miRNA_pw[['miRNA','Pathway','Source(s)']]

In [None]:
# Accessions starting with 'MIMAT' are mature miRNAs; everything else is treated as pre-miRNA.
is_mature_pw = miRNA_pw['miRNA'].str.startswith('MIMAT')
maturemiRNA_pw = miRNA_pw[is_mature_pw]
premiRNA_pw = miRNA_pw[~is_mature_pw]

maturemiRNA_pw.drop_duplicates().to_csv(
    edge_data_location + 'RmiRNA-pw.txt', sep='\t', index=None)
premiRNA_pw.drop_duplicates().to_csv(
    edge_data_location + 'RpremiRNA-pw.txt', sep='\t', index=None)

***
### premiRNA-premiRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# TAM is expected to be loaded already by the miRNA-TF section; uncomment to reload it
# when running this section on its own.
#TAM = pd.read_csv('http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt', sep='\t',names=range(500))
#TAM=TAM.dropna(axis=1, how='all')
# Rows whose first field contains "luster" followed by rows whose first field contains "amily".
miRNA_miRNA=pd.concat([TAM[(TAM[0].str.contains("luster"))],TAM[TAM[0].str.contains("amily")]])
# NOTE(review): column 1 is lower-cased here but dropped two lines below, so this has no
# effect on the result.
miRNA_miRNA[1] = miRNA_miRNA[1].str.lower()
miRNA_miRNA=miRNA_miRNA.dropna(axis=1, how='all')
miRNA_miRNA=miRNA_miRNA.drop(columns=[0,1])
# Column 2 (the first member of each set) is kept as one endpoint; all remaining members
# are joined into one comma-separated string per row.
miRNA_miRNA['merged'] = miRNA_miRNA[miRNA_miRNA.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
miRNA_miRNA=miRNA_miRNA[[2,'merged']]

# One row per (first member, other member) pair.
miRNA_miRNA['merged'] = miRNA_miRNA.merged.str.split(',')
miRNA_miRNA = miRNA_miRNA.explode('merged')
miRNA_miRNA

In [None]:
# 'merged' was already split on ',' and exploded to one member per row in the previous
# cell; repeating the split/explode here changed nothing, so only the provenance tag remains.
miRNA_miRNA['Source(s)'] = 'TAM'
miRNA_miRNA

In [None]:
# Resolve both endpoints through the miRBase map (inner joins): first the exploded
# member ('merged'), then the first member of the set (column 2). Because both joins add
# a column 0, pandas suffixes them: '0_x' belongs to column 2, '0_y' to 'merged'.
miRNA_miRNA = pd.merge(mirna_mirbase_map.rename(columns={2: 'merged'}), miRNA_miRNA, on='merged')
miRNA_miRNA = pd.merge(mirna_mirbase_map, miRNA_miRNA, on=2)
miRNA_miRNA.drop(columns=[2,'merged'], inplace=True)
# This is the pre-miRNA/pre-miRNA edge set, so mature accessions ('MIMAT…') are removed
# from both endpoints; only the '0_y' side used to be filtered.
miRNA_miRNA = miRNA_miRNA[~miRNA_miRNA['0_x'].str.startswith('MIMAT') &
                          ~miRNA_miRNA['0_y'].str.startswith('MIMAT')]
miRNA_miRNA

In [None]:
# Name the two suffixed identifier columns produced by the double merge.
miRNA_miRNA.rename(columns={'0_x':'miRNA1','0_y':'miRNA2'},inplace=True)

In [None]:
# Write the de-duplicated pairs as a tab-separated file without the index column.
miRNA_miRNA.drop_duplicates().to_csv(edge_data_location + 'RpremiRNA-premiRNA.txt', sep='\t', index=None)

***
### miRNA-anatomy - http://purl.obolibrary.org/obo/RO_0001025 (located in)

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# Rows of TAM whose first field contains "TissueSpecific".
miRNA_anatomy = TAM[(TAM[0].str.contains("TissueSpecific"))]
miRNA_anatomy=miRNA_anatomy.drop(columns=[0])
miRNA_anatomy=miRNA_anatomy.dropna(axis=1, how='all')
# Append a second copy of the row labelled 1236, so that set can be assigned to two
# tissues below.
# NOTE(review): 1236 and the positions 3 / 6 are tied to the row order of this
# particular mirset_v9.txt release — re-check them if the file changes.
miRNA_anatomy=pd.concat([miRNA_anatomy,miRNA_anatomy.loc[1236].to_frame().T])
miRNA_anatomy=miRNA_anatomy.reset_index(drop=True)
# Set the tissue name (column 1) of rows 3 and 6 with a single .loc call each. The former
# `iloc[row][1] = value` was a chained assignment: it writes into a temporary row object
# and does not reach the frame when pandas runs with Copy-on-Write. After reset_index the
# row labels equal the row positions, so .loc addresses the same cells.
miRNA_anatomy.loc[3, 1] = "Heart"
miRNA_anatomy.loc[6, 1] = "Muscle"
miRNA_anatomy

In [None]:
# Join all member columns (everything after the tissue name in column 1) into one
# comma-separated string per row.
miRNA_anatomy['merged'] = miRNA_anatomy[miRNA_anatomy.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
# One hand-picked Uberon identifier per row, in row order.
# NOTE(review): the list is positional and must contain exactly as many entries as the
# frame has rows (7 here) — verify the order against column 1.
miRNA_anatomy['Uberon'] = ['UBERON_0002369', 'UBERON_0000955', 'UBERON_0001155',
                           'UBERON_0002349', 'UBERON_0001150', 'UBERON_0001987', 'UBERON_0001630']
miRNA_anatomy=miRNA_anatomy[[1,'Uberon','merged']]
# One row per (tissue, member).
miRNA_anatomy['merged'] = miRNA_anatomy.merged.str.split(',')
miRNA_anatomy = miRNA_anatomy.explode('merged')
miRNA_anatomy['Source(s)'] = 'TAM'
miRNA_anatomy

In [None]:
# Resolve member names through the miRBase map (inner join on the name), then drop the
# tissue name (column 1) and the member name; the Uberon identifier stays.
miRNA_anatomy = pd.merge(mirna_mirbase_map.rename(columns={2: 'merged'}), miRNA_anatomy, on='merged')
miRNA_anatomy.drop(columns=[1,'merged'], inplace=True)
miRNA_anatomy

In [None]:
# Display names for the two endpoints of the edge.
miRNA_anatomy.rename(columns={0:'miRNA','Uberon':'Anatomy'},inplace=True)

In [None]:
# Write the de-duplicated edges as a tab-separated file without the index column.
# NOTE(review): unlike the other miRNA sections, the rows are not split into mature and
# pre-miRNA accessions before being written to the 'RpremiRNA-' file — confirm intended.
miRNA_anatomy.drop_duplicates().to_csv(edge_data_location + 'RpremiRNA-anatomy.txt', sep='\t', index=None)

***
### miRNA-chemical - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [SM2miR](http://www.jianglab.cn/SM2miR/) <br /> SM2miR is a manually curated database that collects the experimentally validated effects of small molecules on miRNA expression reported in published papers. Each entry contains detailed information about the small molecule, the miRNA, and their relationship.

In [None]:
# Fetch the SM2miR spreadsheet into unprocessed_data_location (unconditionally: there is
# no check for an existing local copy here).
data_downloader('http://www.jianglab.cn/SM2miR/files/SM2miR3.xls', unprocessed_data_location)

In [None]:
# Load SM2miR and keep the rows whose 'Species' contains 'sapiens'.
miRNA_chemical = pd.read_excel(unprocessed_data_location + 'SM2miR3.xls')  
miRNA_chemical = miRNA_chemical[miRNA_chemical['Species'].str.contains('sapiens')]
miRNA_chemical

In [None]:
# Normalise the molecule names ('small melocule' is the column's spelling in the source
# file): lower-case, remove parenthesised parts, trim trailing blanks. The pattern is a
# regular expression, so regex=True is passed explicitly — recent pandas versions treat
# the pattern as literal text by default, which would leave the parentheses in place —
# and it is written as a raw string because '\(' is not a valid string escape.
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.lower().str.replace(r"\(.*?\)| \(.*?\)", '', regex=True).str.rstrip()

# Entries naming several molecules joined by '+' become one row per molecule.
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.split('+')
miRNA_chemical = miRNA_chemical.explode('small melocule')
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.rstrip().str.lstrip()

# Inner-join on the molecule name (desc_chebi_map column 0); unmatched names are dropped.
miRNA_chemical = pd.merge(miRNA_chemical, desc_chebi_map, left_on=['small melocule'], right_on=[0])

miRNA_chemical=miRNA_chemical.drop(columns=['miRNA','small melocule','FDA','DB','CID','Species','Year','Reference',0])
miRNA_chemical['Source(s)'] = 'SM2miR'
miRNA_chemical

***
* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet miRNA-molecule table. wget keeps the '?dl=0' query string in the
# saved file name, which is why the next cell opens 'miRNet-mir-mol-hsa.csv?dl=0'.
# NOTE(review): wget saves into the current working directory, while the next cell reads
# from unprocessed_data_location — confirm the two coincide.
! wget https://www.dropbox.com/s/abaeonmjpftbspx/miRNet-mir-mol-hsa.csv?dl=0

In [None]:
# Load the miRNet table, lower-case the molecule names for the join in the next cell,
# and discard the columns that are not used further on.
miRNA_chemical2 = pd.read_csv(unprocessed_data_location + 'miRNet-mir-mol-hsa.csv?dl=0')
miRNA_chemical2['molecule'] = miRNA_chemical2['molecule'].str.lower()
miRNA_chemical2 = miRNA_chemical2.drop(
    columns=['mirnet', 'mir_id', 'drug_bank', 'pubchem_id'])
miRNA_chemical2

In [None]:
# Inner-join on the molecule name (desc_chebi_map column 0); unmatched names are dropped,
# and both name columns are removed afterwards.
miRNA_chemical2 = pd.merge(miRNA_chemical2, desc_chebi_map, left_on=['molecule'], right_on=[0]).drop(columns=[0,'molecule'])
miRNA_chemical2['Source(s)'] = 'miRNet'
miRNA_chemical2

In [None]:
# Outer-join SM2miR and miRNet on (miRNA accession, desc_chebi_map column 1), keeping
# pairs reported by either source.
miRNA_chemical = miRNA_chemical.merge(
    miRNA_chemical2, how='outer', left_on=['miRBase', 1], right_on=['mir_acc', 1])

# Provenance becomes '<SM2miR or nan>|<miRNet or nan>'; popping the two suffixed columns
# removes them from the frame while the combined column is appended at the end.
miRNA_chemical['Source(s)'] = (
    miRNA_chemical.pop('Source(s)_x').astype(str)
    + '|'
    + miRNA_chemical.pop('Source(s)_y').astype(str)
)

miRNA_chemical

In [None]:
# Back-fill the two accession columns from each other so every row carries both. Plain
# assignment is used instead of `df[col].fillna(..., inplace=True)`, which acts on a
# temporary object and is a no-op when pandas runs with Copy-on-Write.
miRNA_chemical['miRBase'] = miRNA_chemical['miRBase'].fillna(miRNA_chemical['mir_acc'])
miRNA_chemical['mir_acc'] = miRNA_chemical['mir_acc'].fillna(miRNA_chemical['miRBase'])

# Keep a snapshot: the miRandola scraping cell that follows reuses this variable name.
miRNA_chemical_old=miRNA_chemical.copy()

***
* [miRandola](http://mirandola.iit.cnr.it/index.php)

In [None]:
# Drug names exactly as they are appended to the miRandola query string (already URL-encoded).
drug_list=['aspirin','bevacizumab','clopidogrel',
           'conventional%20synthetic%20disease-modifying%20antirheumatic%20drugs%20(cs-dmards)',
           'docetaxel', 'epirubicin%20plus%20paclitaxel','fluorouracil%20(5-fu)','gemcitabine',
           'hypomethylating%20agents%20(hmas)','lapatinib','lithium','mercury','n-acetyl%20cysteine%20(nac)',
           'paracetamol','platinum','praziquantel%20(pzq)','sorafenib','testosterone',
           'transarterial%20chemoembolization%20(tace)','trastuzumab','xuezhikang'
          ]
miRNA_chemical_mirandola=[]
for drug in drug_list:
    # Download and parse the page once per drug. It used to be fetched once to count the
    # tables and then again for every single table.
    drug_tables = pd.read_html('http://mirandola.iit.cnr.it/view_drug.php?LV='+drug, header=0)
    for drug_table in drug_tables:
        # Transpose the table and promote the first transposed row to column names.
        record = drug_table.T
        record.columns = record.iloc[0]
        record = record.drop(index=record.iloc[0].name)
        # Of the rows left, remove the one at position 1.
        record = record.drop(index=record.iloc[1].name)
        # Drop the columns named like the first 16 columns.
        record = record.drop(record.iloc[:, :16],axis = 1)
        miRNA_chemical_mirandola.append(record)

miRNA_chemical_mirandola = pd.concat(miRNA_chemical_mirandola)
miRNA_chemical_mirandola=miRNA_chemical_mirandola.drop(columns=['RNA from literature','RNA class', 'miRBase ID',
                                                                'miRBase family', 'Organism','First Author','Journal',
                                                                'Title','Year of publication','Data imported from external databases?'])
miRNA_chemical_mirandola

In [None]:
# The concatenated per-table pieces carry repeated index labels; renumber the rows.
miRNA_chemical_mirandola = miRNA_chemical_mirandola.reset_index(drop=True)
# Inner-join on the drug name (desc_chebi_map column 0); unmatched names are dropped.
miRNA_chemical_mirandola = pd.merge(miRNA_chemical_mirandola, desc_chebi_map, left_on=['Drug'], right_on=[0]).drop(columns=[0])
miRNA_chemical_mirandola['Source(s)'] = 'miRandola'
miRNA_chemical_mirandola

In [None]:
# Outer-join the SM2miR/miRNet snapshot with miRandola on (miRNA accession,
# desc_chebi_map column 1).
miRNA_chemical=pd.merge(miRNA_chemical_old, miRNA_chemical_mirandola, how='outer', left_on=['mir_acc',1], right_on = ['miRBase Accession',1])
# Back-fill the two accession columns from each other. Plain assignment is used instead
# of `df[col].fillna(..., inplace=True)`, which acts on a temporary object and is a no-op
# when pandas runs with Copy-on-Write.
miRNA_chemical['mir_acc'] = miRNA_chemical['mir_acc'].fillna(miRNA_chemical['miRBase Accession'])
miRNA_chemical['miRBase Accession'] = miRNA_chemical['miRBase Accession'].fillna(miRNA_chemical['mir_acc'])

# Append miRandola's provenance to the existing provenance string.
miRNA_chemical['Source(s)_x'] = miRNA_chemical['Source(s)_x'].astype(str)
miRNA_chemical['Source(s)_y'] = miRNA_chemical['Source(s)_y'].astype(str)
miRNA_chemical['Source(s)'] = miRNA_chemical['Source(s)_x'] + '|' + miRNA_chemical['Source(s)_y']
miRNA_chemical = miRNA_chemical.drop(columns=['Source(s)_x', 'Source(s)_y', 'mir_acc', 'miRBase', 'Drug'])

# Strip the 'nan' placeholders left by sources that did not report the pair. Raw string:
# '\|' is not a valid escape sequence in an ordinary string literal.
miRNA_chemical['Source(s)'] = miRNA_chemical['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)
miRNA_chemical

Manual fix.

In [None]:
def _fold_column(frame, target, other, missing='nan'):
    """Merge the text of column `other` into column `target` of `frame`, in place.

    Both columns are compared as strings; `missing` is the string that stands
    for an absent value after the string conversion. Rows whose `target` is
    missing take the value of `other`; rows where both values are present and
    different become 'target|other'; every other row keeps `target`.

    This replaces an `astype(str)` + `fillna(...)` sequence that could never
    fill anything, because the missing values had already been turned into
    text before `fillna` ran.
    """
    primary = frame[target].astype(str)
    secondary = frame[other].astype(str)
    both_present = (primary != missing) & (secondary != missing)
    folded = primary.where(primary != missing, secondary)
    frame[target] = folded.where(~(both_present & (primary != secondary)), primary + '|' + secondary)


# --- Condition ---------------------------------------------------------------
_fold_column(miRNA_chemical, 'Condition', 'condition')

# --- Expression --------------------------------------------------------------
up_regulation = 'http://purl.obolibrary.org/obo/OMIT_0016489 (up-regulation)'
down_regulation = 'http://purl.obolibrary.org/obo/OMIT_0016265 (down-regulation)'
# The misspelt 'regualted' variants are matched on purpose.
for label, term in (('up-regulated', up_regulation), ('down-regulated', down_regulation),
                    ('up-regualted', up_regulation), ('down-regualted', down_regulation)):
    miRNA_chemical['expression'] = miRNA_chemical['expression'].str.replace(label, term, regex=False)
for label, term in (('up', up_regulation), ('down', down_regulation)):
    miRNA_chemical['Expression'] = miRNA_chemical['Expression'].str.replace(label, term, regex=False)
_fold_column(miRNA_chemical, 'expression', 'Expression pattern of miRNA')
_fold_column(miRNA_chemical, 'expression', 'Expression')

# --- PubMed identifiers ------------------------------------------------------
# Going through the nullable 'Int64' dtype renders missing identifiers as the
# text '<NA>'. Folding with that marker keeps identifiers that are present only
# in 'PubMed ID' or 'pmid'; previously those rows ended up without a reference.
for pmid_column in ('PMID', 'PubMed ID', 'pmid'):
    miRNA_chemical[pmid_column] = miRNA_chemical[pmid_column].astype('Int64').astype(str)
_fold_column(miRNA_chemical, 'PMID', 'PubMed ID', missing='<NA>')
_fold_column(miRNA_chemical, 'PMID', 'pmid', missing='<NA>')

miRNA_chemical['PMID'] = 'https://pubmed.ncbi.nlm.nih.gov/' + miRNA_chemical['PMID']
# Strip a float-style '.0' suffix from an identifier. The dot is escaped and
# anchored to the end of an identifier: an unescaped '.0' pattern would delete
# every character that is followed by a zero and corrupt the PMIDs.
miRNA_chemical['PMID'] = miRNA_chemical['PMID'].str.replace(r'\.0(?=\||$)', '', regex=True)
miRNA_chemical['PMID'] = miRNA_chemical['PMID'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)

miRNA_chemical = miRNA_chemical.drop(columns=['condition', 'Expression pattern of miRNA',
                                              'pmid', 'PubMed ID', 'Expression'])

miRNA_chemical['Detection method '] = miRNA_chemical['Detection method '].str.replace(r'nan\||\|nan', '', regex=True)
miRNA_chemical['PMID'] = miRNA_chemical['PMID'].replace('<NA>', np.nan)

# --- Controlled-vocabulary substitutions --------------------------------------
# All labels below are literal text. `regex=False` is passed explicitly because
# several of them contain '(' / ')' or '|', which would otherwise be read as
# regex syntax on pandas versions where `str.replace` defaults to regex=True.
literal_terms = {
    'Sample': (
        ('serum', 'http://purl.obolibrary.org/obo/BTO_0001239 (serum)'),
        ('plasma', 'http://purl.obolibrary.org/obo/ENVO_01000798 (plasma)'),
        ('culture medium', 'http://purl.obolibrary.org/obo/OBI_0000079 (culture medium)'),
    ),
    'exRNA form': (
        ('microvesicle', 'http://purl.obolibrary.org/obo/GO_1990742 (microvescicle)'),
    ),
}
for column, substitutions in literal_terms.items():
    for label, term in substitutions:
        miRNA_chemical[column] = miRNA_chemical[column].str.replace(label, term, regex=False)

miRNA_chemical['Sample source'] = miRNA_chemical['Sample source'].replace('-', np.nan)
miRNA_chemical['Sample source'] = miRNA_chemical['Sample source'].str.replace(
    'hepg2|plc-prf5', 'http://www.ebi.ac.uk/efo/EFO_0001187 (hepg2)|plc-prf5', regex=False)

disease_column = 'Diseases, Cell Lines or normal status'
disease_terms = {
    'pancreatic ductal adenocarcinoma (pdac)':
        'http://purl.obolibrary.org/obo/MONDO_0005184 (pancreatic ductal adenocarcinoma)',
    'troponin-negative non-st elevation acute coronary syndrome (nste-acs)':
        'http://purl.obolibrary.org/obo/MONDO_0005542 (acute coronary syndrome)',
    'Breast cancer':
        'http://purl.obolibrary.org/obo/MONDO_0007254 (breast cancer)',
    'Acute liver failure':
        'http://purl.obolibrary.org/obo/MONDO_0019542 (acute liver failure)',
    'non small cell lung cancer (nsclc)':
        'http://purl.obolibrary.org/obo/MONDO_0005233 (non-small cell lung carcinoma)',
    'hepatocellular carcinoma (hcc)':
        'http://purl.obolibrary.org/obo/MONDO_0007256 (hepatocellular carcinoma)',
}
for label, term in disease_terms.items():
    miRNA_chemical[disease_column] = miRNA_chemical[disease_column].str.replace(label, term, regex=False)

miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.replace(r'nan\||\|nan', '', regex=True)

miRNA_chemical

- Detection method.

In [None]:
detection = 'Detection method '
qrt_pcr_term = 'http://purl.obolibrary.org/obo/NCIT_C28408 (quantitative reverse transcriptase pcr)'

# Lower-case the three method columns and replace the free-text qRT-PCR
# spellings found in each of them with the NCIT term.
pcr_aliases = {
    detection: ('quantitative real-time pcr', 'qrt-pcr'),
    'method': ('quantitative real-time pcr', 'qrt-pcr'),
    'Methods': ('real-time rt-pcr', 'qrt-pcr'),
}
for column, aliases in pcr_aliases.items():
    normalised = miRNA_chemical[column].str.lower()
    for alias in aliases:
        normalised = normalised.str.replace(alias, qrt_pcr_term, regex=False)
    miRNA_chemical[column] = normalised

# Fold 'method' and then 'Methods' into the detection column. After the string
# conversion a missing value is the text 'nan', so it is tested for explicitly:
# a missing detection method takes the other column's value, and two different
# values are joined with '|'. (A `fillna` after `astype(str)` never fills
# anything and left 'nan|...' / '...|nan' entries behind.)
for extra in ('method', 'Methods'):
    miRNA_chemical[extra] = miRNA_chemical[extra].astype(str)
    primary = miRNA_chemical[detection].astype(str)
    secondary = miRNA_chemical[extra]
    both_present = (primary != 'nan') & (secondary != 'nan')
    folded = primary.where(primary != 'nan', secondary)
    miRNA_chemical[detection] = folded.where(~(both_present & (primary != secondary)),
                                             primary + '|' + secondary)

miRNA_chemical[detection] = [replace_with_ncit(item) for item in miRNA_chemical[detection]]
miRNA_chemical[detection].unique()[:5]

- Condition

In [None]:
# Drop a plural 's' at the end of the value and in front of each '|' separator.
# The pipe has to be escaped: the unescaped pattern r's|' means "an 's' OR the
# empty string" and therefore removed every 's' in the column.
# NOTE(review): the separator is kept here ('s|' -> '|'); confirm that the
# replace_with_* lookups below were not tuned against the old, 's'-less text.
miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.replace(r's$', '', regex=True)
miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.replace(r's\|', '|', regex=True)

# Normalise list separators, hyphen spacing and case before the term lookups.
miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.replace(', ', '|', regex=False)
miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.replace('- ', '-', regex=False)
miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.lower()

# Apply the three term-replacement helpers in sequence to every value.
for replace_terms in (replace_with_clo, replace_with_clo2, replace_with_ncit):
    miRNA_chemical['Condition'] = [replace_terms(item) for item in miRNA_chemical['Condition']]
miRNA_chemical['Condition'].unique()[:5]

In [None]:
# Give the columns their final names, then move 'miRNA' to position 1 and
# 'Chemical' to position 0.
final_column_names = {
    'Detection method ': 'Detection method',
    'PMID': 'References (PMID)',
    1: 'Chemical',
    'method': 'Method',
    'expression': 'Expression',
    'miRBase Accession': 'miRNA',
}
miRNA_chemical.rename(columns=final_column_names, inplace=True)
for position, column in ((1, 'miRNA'), (0, 'Chemical')):
    miRNA_chemical.insert(position, column, miRNA_chemical.pop(column))

In [None]:
# Pass the table through `merge_rows` keyed on ('Chemical', 'miRNA') and write
# the result as a tab-separated file without the index.
merge_rows(miRNA_chemical, 'Chemical', 'miRNA').to_csv(
    edge_data_location + 'Rchemical-miRNA.txt',
    sep='\t',
    index=None,
)

***
### gRNA-gene - http://purl.obolibrary.org/obo/RO_0011007 (decreases by repression quantity of)

* [Addgene](https://www.addgene.org/)

In [None]:
# Source table: copy-paste from https://www.addgene.org/crispr/reference/grna-sequence/#datatable
gRNA_gene = pd.read_csv(
    unprocessed_data_location + 'grna_sequences_addgene.txt',
    sep='\t',
    dtype={"Plasmid ID": str},
)
gRNA_gene.columns = gRNA_gene.columns.str.rstrip()

# Keep rows whose target species contains 'apiens' (rows without a species are
# discarded) and that have a plasmid identifier.
targets_human = gRNA_gene['Target Species'].str.contains('apiens', na=False)
has_plasmid_id = gRNA_gene['Plasmid ID'].notna()
gRNA_gene = gRNA_gene[targets_human & has_plasmid_id]

# Prefix the numeric plasmid identifier and upper-case the gene symbols.
plasmid_numbers = gRNA_gene['Plasmid ID'].astype('Int64').astype(str).str.rstrip()
gRNA_gene['Plasmid ID'] = 'www.addgene.org/' + plasmid_numbers
gRNA_gene['Target Gene'] = gRNA_gene['Target Gene'].str.upper().astype(str).str.rstrip()

gRNA_gene.drop(columns=['Target Species', 'Cas9 Species', 'Depositor'], inplace=True)
gRNA_gene

In [None]:
# Inner-join on the gene symbol against the '0_x' column of
# `symbol_entrez_map`; the symbol itself is dropped afterwards and '0_y' kept.
addgene_gene_lookup = symbol_entrez_map[['0_x', '0_y']].rename(columns={"0_x": "Target Gene"})
gRNA_gene = gRNA_gene.merge(addgene_gene_lookup, on='Target Gene').drop(columns=['Target Gene'])

# Column order: 'Plasmid ID' first, '0_y' second.
for leading_column in ('0_y', 'Plasmid ID'):
    gRNA_gene.insert(0, leading_column, gRNA_gene.pop(leading_column))

gRNA_gene['Source(s)'] = 'Addgene'

gRNA_gene

In [None]:
# Remove a single trailing space from the application labels.
gRNA_gene['Application'] = gRNA_gene['Application'].str.replace(' $', '', regex=True)
gRNA_gene['Application'].unique()

In [None]:
# Strip a trailing space, remove the 'gRNA1:' label and turn the 'gRNA2'
# label into a '|' separator between the two sequences.
target_sequence = gRNA_gene['Target Sequence'].str.replace(' $', '', regex=True)
for literal, replacement in (('gRNA1: ', ''), ('gRNA1:', ''), (', gRNA2', '|')):
    target_sequence = target_sequence.str.replace(literal, replacement, regex=False)
# NOTE(review): the ', gRNA2' form above leaves whatever follows the label
# (for example ': ') in place, unlike the '; gRNA2:' form below - confirm
# against the data.
gRNA_gene['Target Sequence'] = target_sequence.str.replace(r'; gRNA2:\s*', '|', regex=True)
gRNA_gene['Target Sequence'].unique()

In [None]:
# Turn the PubMed identifiers into URLs; a missing identifier (rendered as
# '<NA>') becomes NaN again.
pubmed_prefix = 'https://pubmed.ncbi.nlm.nih.gov/'
pubmed_ids = gRNA_gene['PubMed ID'].astype('Int64', errors='ignore').astype('str')
gRNA_gene['PubMed ID'] = (pubmed_prefix + pubmed_ids).replace(pubmed_prefix + '<NA>', np.nan)

In [None]:
# Final column names for the edge file.
gRNA_gene.rename(
    columns={
        '0_y': 'Gene',
        'PubMed ID': 'References (PMID)',
    },
    inplace=True,
)

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
gRNA_gene.drop_duplicates().to_csv(
    edge_data_location + 'RgRNA-gene.txt',
    sep='\t',
    index=None,
)

***
### ASO-mRNA - http://purl.obolibrary.org/obo/RO_0003002 (represses expression of)

* [eSkip-Finder](https://eskip-finder.org/cgi-bin/input.cgi) <br /> eSkip-Finder is the first machine learning-based design tool and database of antisense oligonucleotides (ASOs) for exon skipping. It addresses a significant challenge in the field: the difficulty of selecting an optimal target sequence for exon skipping.

In [None]:
# Saved copy of https://eskip-finder.org/cgi-bin/search.cgi; the third table
# on the page is the one used.
ASO_mRNA = pd.read_html(unprocessed_data_location + 'eSkip-Finder.html')[2]

# Keep human entries that have an oligo name and a confidence level of '1'.
confidence_column = 'confidence level (1:describe to explicitly / 0:speculated from context)'
keep_row = (
    (ASO_mRNA['Species'] == 'human')
    & (ASO_mRNA['Oligo name in literature'] != 'Null')
    & (ASO_mRNA[confidence_column] == '1')
)
ASO_mRNA = ASO_mRNA[keep_row]

unused_eskip_columns = [
    'Oligo index in literature',
    'Oligo sequence /: Cocktail. -: weasel (connected).',
    'cap of 5 or 3 terminal (Conjugated end is not specified.)',
    'cap of 5 terminal', 'cap of 3 terminal',
    'Species', 'Oligo chemistry', 'Literature info (Patent ID) (original)',
    'Title', 'Date', 'Inventor', 'Assignee/Applicants',
    'Figure/Table in literature', 'Appendix', 'Unnamed: 31',
]
ASO_mRNA.drop(columns=unused_eskip_columns, inplace=True)

# Join on the target gene name against the '0_x' column of `symbol_entrez_map`
# and tag the resulting '0_y' identifier as an mRNA node.
eskip_gene_lookup = symbol_entrez_map.rename(columns={'0_x': 'Target gene (RNA)'})
ASO_mRNA = ASO_mRNA.merge(eskip_gene_lookup, on='Target gene (RNA)')
ASO_mRNA.drop(columns=['Target gene (RNA)'], inplace=True)
ASO_mRNA['0_y'] = ASO_mRNA['0_y'].astype(str) + '?mRNA'

# Oligo names are used without any spaces.
oligo_names = ASO_mRNA['Oligo name in literature'].str.strip()
ASO_mRNA['Oligo name in literature'] = oligo_names.str.replace(' ', '')
ASO_mRNA['Source(s)'] = 'eSkip-Finder'
ASO_mRNA

In [None]:
# Final column names, then 'Oligo name in literature' first and 'mRNA' second.
eskip_column_names = {
    '0_y': 'mRNA',
    'cells used': 'Cell line',
    'nested pcr?': 'Nested PCR (Yes/No)',
}
ASO_mRNA.rename(columns=eskip_column_names, inplace=True)
for position, column in ((1, 'mRNA'), (0, 'Oligo name in literature')):
    ASO_mRNA.insert(position, column, ASO_mRNA.pop(column))

In [None]:
# Pass the table through `merge_rows` keyed on (oligo name, mRNA) and write
# the result as a tab-separated file without the index.
merge_rows(ASO_mRNA, 'Oligo name in literature', 'mRNA').to_csv(
    edge_data_location + 'RASO-mRNA.txt',
    sep='\t',
    index=None,
)

***
### ASO drug-mRNA - http://purl.obolibrary.org/obo/RO_0002430 (involved in negative regulation of)

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709) <br /> DrugBank is a comprehensive, free-to-access, online database containing information on drugs and drug targets. As both a bioinformatics and a cheminformatics resource, it combines detailed drug (i.e. chemical, pharmacological and pharmaceutical) data with comprehensive drug target (i.e. sequence, structure, and pathway) information.

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT001709
# (read without a header row, so the columns are numbered 0..n).
ASO_mRNA = pd.read_csv(
    unprocessed_data_location + 'ASO-gene_DrugBank.txt',
    sep='\t',
    header=None,
)
ASO_mRNA

In [None]:
# Manually curated annotations, one entry per row of the pasted table (19 rows).
aso_ncbi_gene_ids = [
    338, np.nan, np.nan, np.nan, np.nan, np.nan, 1756, np.nan,
    338, 211, np.nan, 1756, 1756, np.nan, np.nan, np.nan, np.nan, 7276, np.nan,
]
aso_drugbank_ids = (
    ['DB05528']
    + ['DB05487'] * 3
    + ['DB06759'] * 2
    + ['DB06014', 'DB13161', 'DB14713', 'DB15066', 'DB15593', 'DB15005']
    + ['DB14984'] * 5
    + ['DB16699'] * 2
)

# Gene identifiers become '<id>?mRNA' nodes; rows without an identifier get NaN.
ASO_mRNA['NCBI'] = aso_ncbi_gene_ids
aso_mrna_nodes = ASO_mRNA['NCBI'].astype('Int64').astype(str) + '?mRNA'
ASO_mRNA['NCBI'] = aso_mrna_nodes.replace('<NA>?mRNA', np.nan)

ASO_mRNA['DB ID'] = aso_drugbank_ids
ASO_mRNA.drop(columns=[0, 3, 4], inplace=True)
#ASO_mRNA.insert(2,'Type',ASO_mRNA.pop('Type'))
ASO_mRNA['Source(s)'] = 'DrugBank'
ASO_mRNA

In [None]:
# Final column names, then 'DrugBank ID' first and 'mRNA' second.
aso_mrna_column_names = {
    'DB ID': 'DrugBank ID',
    'NCBI': 'mRNA',
    2: 'Action',
    1: 'Target description',
}
ASO_mRNA.rename(columns=aso_mrna_column_names, inplace=True)
for position, column in enumerate(('DrugBank ID', 'mRNA')):
    ASO_mRNA.insert(position, column, ASO_mRNA.pop(column))

In [None]:
# Rows with any missing value are discarded before the de-duplicated table is
# written as a tab-separated file without the index.
ASO_mRNA.dropna().drop_duplicates().to_csv(
    edge_data_location + 'RASOd-mRNA.txt',
    sep='\t',
    index=None,
)

***
### ASO drug-disease - http://purl.obolibrary.org/obo/RO_0002606 (is substance that treats)

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# Show full cell contents when the frame is displayed.
pd.set_option('display.max_colwidth', None)

# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_disease = pd.read_csv(
    unprocessed_data_location + 'ASO-disease_DrugBank.txt',
    sep='\t',
)
ASO_disease

In [None]:
# Manually curated (DrugBank ID, MONDO terms) pairs, one per row of the pasted
# table. Several terms for one drug are comma-separated; '<NA>' marks a drug
# without a term.
aso_disease_annotations = [
    ('DB05528', 'MONDO_0018328'),
    ('DB05487', 'MONDO_0001657,MONDO_0007254'),
    ('DB06759', 'MONDO_0000878'),
    ('DB13811', '<NA>'),
    ('DB06014', 'MONDO_0010679'),
    ('DB13161', 'MONDO_0001516'),
    ('DB14713', 'MONDO_0017132,MONDO_0001824'),
    ('DB15066', 'MONDO_0002520'),
    ('DB15593', 'MONDO_0010679'),
    ('DB15005', 'MONDO_0010679'),
    ('DB14984', 'MONDO_0010679'),
    ('DB16699', 'MONDO_0017132'),
    ('DB14782', 'MONDO_0004976'),
    ('DB18159', '<NA>'),
]
ASO_disease['DB ID'] = [drugbank_id for drugbank_id, _ in aso_disease_annotations]
ASO_disease['MONDO'] = [mondo_terms for _, mondo_terms in aso_disease_annotations]

# One row per (drug, disease) pair.
ASO_disease['MONDO'] = ASO_disease['MONDO'].str.split(',')
ASO_disease = ASO_disease.explode('MONDO')
ASO_disease.drop(columns=['Drug', 'Drug Description'], inplace=True)
ASO_disease['Source(s)'] = 'DrugBank'

# Discard the drugs without a MONDO term.
has_mondo_term = ASO_disease['MONDO'] != '<NA>'
ASO_disease = ASO_disease[has_mondo_term]
ASO_disease

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
ASO_disease.drop_duplicates().to_csv(
    edge_data_location + 'RASOd-disease.txt',
    sep='\t',
    index=None,
)

***
### ASO drug-protein - http://purl.obolibrary.org/obo/RO_0011007 (decreases by repression quantity of) + http://purl.obolibrary.org/obo/RO_0010002 (is carrier of)

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT001709
# (read without a header row, so the columns are numbered 0..n).
ASO_protein = pd.read_csv(
    unprocessed_data_location + 'ASO-gene_DrugBank.txt',
    sep='\t',
    header=None,
)

# Manually curated annotations, one entry per row of the pasted table (19 rows).
aso_protein_drugbank_ids = (
    ['DB05528']
    + ['DB05487'] * 3
    + ['DB06759'] * 2
    + ['DB06014', 'DB13161', 'DB14713', 'DB15066', 'DB15593', 'DB15005']
    + ['DB14984'] * 5
    + ['DB16699'] * 2
)
aso_protein_pro_ids = [
    np.nan, 'PR_000007204', 'PR_000011178', 'PR_000001754', 'PR_Q9BTL4', 'PR_Q16621',
    np.nan, 'PR_Q16637', np.nan, np.nan, 'PR_P11532', np.nan, np.nan, 'PR_P08684',
    'PR_P20815', 'PR_P11712', 'PR_P33261', np.nan, 'PR_P02768',
]
ASO_protein['DB ID'] = aso_protein_drugbank_ids
ASO_protein['PRO'] = aso_protein_pro_ids

# NOTE(review): the hand-written 'DB ID' column is dropped again right here and
# file column 4 is what later gets renamed to 'DrugBank ID' - confirm that
# column 4 really holds the DrugBank identifier.
ASO_protein.drop(columns=[0, 'DB ID', 3], inplace=True)
ASO_protein['Source(s)'] = 'DrugBank'
ASO_protein

In [None]:
# Final column names, then 'DrugBank ID' first and 'Protein' second.
aso_protein_column_names = {
    4: 'DrugBank ID',
    'PRO': 'Protein',
    2: 'Action',
    1: 'Target description',
}
ASO_protein.rename(columns=aso_protein_column_names, inplace=True)
for position, column in enumerate(('DrugBank ID', 'Protein')):
    ASO_protein.insert(position, column, ASO_protein.pop(column))

In [None]:
# The first 18 rows (positions 0-17) go to the 11007 edge file; pairs with a
# missing identifier are left out.
repression_edges = ASO_protein.iloc[0:18][['DrugBank ID', 'Protein']].dropna()
repression_edges.to_csv(edge_data_location + 'RASOd-protein11007.txt', sep='\t', index=None)

In [None]:
# Row 18 goes to the 10002 edge file. It is selected with a list index so it
# stays a one-row DataFrame: `iloc[18]` returns a Series, and concatenating two
# Series wrote all field values stacked into a single column instead of a
# (DrugBank ID, Protein) edge row. The columns mirror the 11007 export.
# NOTE(review): the row is still written twice, as before - confirm whether
# the duplicate line is needed downstream.
carrier_edge = ASO_protein.iloc[[18]][['DrugBank ID', 'Protein']]
pd.concat([carrier_edge, carrier_edge]).to_csv(
    edge_data_location + 'RASOd-protein10002.txt', sep='\t', index=None)

***
### siRNA drug-mRNA - http://purl.obolibrary.org/obo/RO_0002430 (involved in negative regulation of)

* [DrugBank](https://go.drugbank.com/categories/DBCAT005484) 

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_mRNA = pd.read_csv(
    unprocessed_data_location + 'siRNA-gene_DrugBank.txt',
    sep='\t',
)
siRNA_mRNA

In [None]:
# Manually curated annotations, one entry per row of the pasted table (9 rows).
siRNA_mRNA['NCBI'] = [7276, np.nan, np.nan, 338, 54363, np.nan, np.nan, 7276, np.nan]
siRNA_mRNA['DB ID'] = ['DB14582', 'DB14582', 'DB14582',
                       'DB15066',
                       'DB15935', 'DB15935', 'DB15935',
                       'DB16699', 'DB16699']

# Gene identifiers become '<id>?mRNA' nodes. A missing identifier is rendered
# as '<NA>?mRNA' by the string conversion and is turned back into NaN. (The
# marker was previously matched as '<NA>#mRNA', which never occurs, so neither
# the replacement nor the row filter below had any effect.)
siRNA_mRNA['NCBI'] = siRNA_mRNA['NCBI'].astype('Int64').astype(str) + '?mRNA'
siRNA_mRNA['NCBI'] = siRNA_mRNA['NCBI'].replace('<NA>?mRNA', np.nan)

siRNA_mRNA.drop(columns=['Drug', 'Target'], inplace=True)
siRNA_mRNA.insert(2, 'Type', siRNA_mRNA.pop('Type'))
siRNA_mRNA['Source(s)'] = 'DrugBank'

# Keep only the rows with a target gene; `.copy()` makes the result an
# independent frame so that later in-place edits do not act on a slice.
siRNA_mRNA = siRNA_mRNA[siRNA_mRNA['NCBI'].notna()].copy()
siRNA_mRNA

In [None]:
# Final column names, then 'DrugBank ID' first and 'mRNA' second.
siRNA_mRNA.rename(
    columns={'DB ID': 'DrugBank ID', 'NCBI': 'mRNA'},
    inplace=True,
)
for position, column in enumerate(('DrugBank ID', 'mRNA')):
    siRNA_mRNA.insert(position, column, siRNA_mRNA.pop(column))

In [None]:
# Rows with any missing value are discarded before the de-duplicated table is
# written as a tab-separated file without the index.
siRNA_mRNA.dropna().drop_duplicates().to_csv(
    edge_data_location + 'RsiRNAd-mRNA.txt',
    sep='\t',
    index=None,
)

***
### siRNA-mRNA - http://purl.obolibrary.org/obo/RO_0002430 (involved in negative regulation of)

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html) <br /> The MIT/ICBP siRNA Database has validated siRNA and shRNA sequences against over 100 genes.

In [None]:
# The second table on the page holds the data; its first row carries the
# column names, so it is promoted to the header and then removed.
ICBP = pd.read_html('http://web.mit.edu/sirna/sirnas-gene.html')[1]
header_row = ICBP.iloc[[0]].squeeze()
ICBP.columns = header_row
ICBP.drop(0, inplace=True)
ICBP

In [None]:
# For post-processing purposes
ICBP[['ID#']] = ICBP[['ID#']] + '.html'

# Rows flagged 'x' in both the 'siRNA' and the 'Human' column. The selection is
# copied explicitly so that the edits below act on an independent frame instead
# of on a slice of `ICBP` (avoids SettingWithCopyWarning / ambiguous
# view-versus-copy behaviour).
ICBPsiRNA = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
# '---' is the table's placeholder for "no value".
ICBPsiRNA['Protein knockdown'] = ICBPsiRNA['Protein knockdown'].replace('---', np.nan)
ICBPsiRNA = ICBPsiRNA.drop(columns=['siRNA', 'shRNA', 'Mouse', 'Human'])
ICBPsiRNA['Source(s)'] = 'ICBP siRNA'
ICBPsiRNA

In [None]:
# Inner-join on the gene symbol against the '0_x' column of
# `symbol_entrez_map`; the lookup is the left table, so '0_y' comes first.
sirna_gene_lookup = symbol_entrez_map[['0_x', '0_y']].rename(columns={"0_x": "Target Gene"})
ICBPsiRNA = sirna_gene_lookup.merge(ICBPsiRNA, on="Target Gene").drop(columns='Target Gene')

# '0_y' becomes an '<id>?mRNA' node; a missing identifier goes back to NaN.
sirna_mrna_nodes = ICBPsiRNA['0_y'].astype('Int64').astype(str) + '?mRNA'
ICBPsiRNA['0_y'] = sirna_mrna_nodes.replace('<NA>?mRNA', np.nan)
ICBPsiRNA.rename(
    columns={'0_y': 'mRNA', 'ID#': 'siRNA', 'NCBI Probe #': 'NCBI Probe'},
    inplace=True,
)

# Probe numbers become report URLs; a missing number (rendered as 'nan') goes
# back to NaN.
probe_report_url = "https://www.ncbi.nlm.nih.gov/genome/probe/reports/probereport.cgi?uid="
probe_links = probe_report_url + ICBPsiRNA['NCBI Probe'].astype(str)
ICBPsiRNA['NCBI Probe'] = probe_links.replace(probe_report_url + 'nan', np.nan)

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
ICBPsiRNA.drop_duplicates().to_csv(
    edge_data_location + 'RsiRNA-mRNA.txt',
    sep='\t',
    index=None,
)

***
### shRNA-mRNA - http://purl.obolibrary.org/obo/RO_0002430 (involved in negative regulation of)

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html)

In [None]:
# Rows flagged 'x' in both the 'shRNA' and the 'Human' column. The selection is
# copied explicitly so that the edits below act on an independent frame instead
# of on a slice of `ICBP` (avoids SettingWithCopyWarning / ambiguous
# view-versus-copy behaviour).
ICBPshRNA = ICBP.loc[(ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
# 'NCBI Probe #' contains only nan values (or the 'N/A*' placeholder) for these
# rows, so it is dropped together with the flag columns.
ICBPshRNA = ICBPshRNA.drop(columns=['siRNA', 'shRNA', 'Mouse', 'Human',
                                    'Protein knockdown', 'NCBI Probe #'])
ICBPshRNA['Source(s)'] = 'ICBP siRNA'
ICBPshRNA.head()

In [None]:
# Inner-join on the gene symbol against the '0_x' column of
# `symbol_entrez_map`; the lookup is the left table, so '0_y' comes first.
shrna_gene_lookup = symbol_entrez_map[['0_x', '0_y']].rename(columns={"0_x": "Target Gene"})
ICBPshRNA = shrna_gene_lookup.merge(ICBPshRNA, on="Target Gene").drop(columns='Target Gene')

# '0_y' becomes an '<id>?mRNA' node; a missing identifier goes back to NaN.
shrna_mrna_nodes = ICBPshRNA['0_y'].astype('Int64').astype(str) + '?mRNA'
ICBPshRNA['0_y'] = shrna_mrna_nodes.replace('<NA>?mRNA', np.nan)

ICBPshRNA.rename(columns={'0_y': 'mRNA', 'ID#': 'shRNA'}, inplace=True)

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
ICBPshRNA.drop_duplicates().to_csv(
    edge_data_location + 'RshRNA-mRNA.txt',
    sep='\t',
    index=None,
)

***
### siRNA drug-disease - http://purl.obolibrary.org/obo/RO_0002606 (is substance that treats)

* [DrugBank](https://go.drugbank.com/categories/DBCAT005484) 

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_disease = pd.read_csv(
    unprocessed_data_location + 'siRNA-disease_DrugBank.txt',
    sep='\t',
)
siRNA_disease

In [None]:
# Manually curated (DrugBank ID, MONDO terms) pairs, one per row of the pasted
# table; several terms for one drug are comma-separated.
sirna_disease_annotations = [
    ('DB14582', 'MONDO_0017132,MONDO_0001824'),
    ('DB15066', 'MONDO_0002520'),
    ('DB15935', 'MONDO_0009823'),
    ('DB16699', 'MONDO_0017132,MONDO_0001824'),
]
siRNA_disease['DB ID'] = [drugbank_id for drugbank_id, _ in sirna_disease_annotations]
siRNA_disease['MONDO'] = [mondo_terms for _, mondo_terms in sirna_disease_annotations]

# One row per (drug, disease) pair.
siRNA_disease['MONDO'] = siRNA_disease['MONDO'].str.split(',')
siRNA_disease = siRNA_disease.explode('MONDO')
siRNA_disease.drop(columns=['Drug', 'Drug Description'], inplace=True)
siRNA_disease['Source(s)'] = 'DrugBank'
siRNA_disease

In [None]:
# Final column names for the edge file.
siRNA_disease.rename(
    columns={
        'DB ID': 'DrugBank ID',
        'MONDO': 'Disease',
    },
    inplace=True,
)

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
siRNA_disease.drop_duplicates().to_csv(
    edge_data_location + 'RsiRNAd-disease.txt',
    sep='\t',
    index=None,
)

***
### aptamer-protein - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with)

* [Apta-Index](https://www.aptagen.com/apta-index/) <br/>
Apta-Index is the most advanced user-friendly database on aptamers. Aptagen does not list the information contained herein as products, but as a database of information obtained from the published literature. 

In [None]:
# The CSV is read with explicit column names.
aptaindex_columns = ['Name', 'ID', 'Target', 'Sequence']
aptamer_protein = pd.read_csv(unprocessed_data_location + 'aptaindex.csv', names=aptaindex_columns)

# Match the lower-cased target name against column 0 of `desc_pro_map`
# (inner join, so aptamers whose target is not in the map are dropped).
aptamer_protein['Target'] = aptamer_protein['Target'].str.lower()
protein_lookup = desc_pro_map.rename(columns={0: 'Target'})
aptamer_protein = aptamer_protein.merge(protein_lookup, on='Target')

aptamer_protein['ID'] = 'aptamer-details/?id=' + aptamer_protein['ID'].astype(str)
aptamer_protein = aptamer_protein.drop(columns=['Name', 'Target', 'Sequence'])
aptamer_protein['Source(s)'] = 'Apta-Index'
aptamer_protein

In [None]:
# Final column names for the edge file.
aptamer_protein.rename(
    columns={
        'ID': 'Aptamer',
        1: 'Protein',
    },
    inplace=True,
)

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
aptamer_protein.drop_duplicates().to_csv(
    edge_data_location + 'Raptamer-protein.txt',
    sep='\t',
    index=None,
)

***
### aptamer-chemical - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with)

* [Apta-Index](https://www.aptagen.com/apta-index/)

In [None]:
# The CSV is read with explicit column names.
aptaindex_chemical_columns = ['Name', 'ID', 'Target', 'Sequence']
aptamer_chemical = pd.read_csv(unprocessed_data_location + 'aptaindex.csv',
                               names=aptaindex_chemical_columns)

# Match the lower-cased target name against column 0 of `desc_chebi_map`
# (inner join, so aptamers whose target is not in the map are dropped).
aptamer_chemical['Target'] = aptamer_chemical['Target'].str.lower()
chemical_lookup = desc_chebi_map.rename(columns={0: 'Target'})
aptamer_chemical = aptamer_chemical.merge(chemical_lookup, on='Target')

aptamer_chemical['ID'] = 'aptamer-details/?id=' + aptamer_chemical['ID'].astype(str)
aptamer_chemical = aptamer_chemical.drop(columns=['Name', 'Target', 'Sequence'])
aptamer_chemical['Source(s)'] = 'Apta-Index'
aptamer_chemical

In [None]:
# Final column names for the edge file.
aptamer_chemical.rename(
    columns={
        'ID': 'Aptamer',
        1: 'Chemical',
    },
    inplace=True,
)

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
aptamer_chemical.drop_duplicates().to_csv(
    edge_data_location + 'Raptamer-chemical.txt',
    sep='\t',
    index=None,
)

***
### aptamer drug-protein - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with)

* [DrugBank](https://go.drugbank.com/categories/DBCAT001641) 

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT001641
# (this rebinds `aptamer_protein`, which the Apta-Index section used earlier).
aptamer_protein = pd.read_csv(
    unprocessed_data_location + 'aptamer-protein_DrugBank.txt',
    sep='\t',
)
aptamer_protein['Source(s)'] = 'DrugBank'
aptamer_protein

In [None]:
# Manually curated (DrugBank ID, protein identifier) pairs, one per row of the
# pasted table.
aptamer_protein_annotations = [
    ('DB04932', 'PR_000001575'),
    ('DB04932', 'PR_000001576'),
    ('DB04932', 'PR_000001577'),
    ('DB04998', 'PR_000001752'),
    ('DB15165', 'PR_P01031'),
]
aptamer_protein['DB ID'] = [drugbank_id for drugbank_id, _ in aptamer_protein_annotations]
aptamer_protein['PRO'] = [protein_id for _, protein_id in aptamer_protein_annotations]

aptamer_protein.drop(columns=['Drug', 'Target'], inplace=True)
aptamer_protein.insert(2, 'Type', aptamer_protein.pop('Type'))
aptamer_protein

In [None]:
# Final column names, then 'DrugBank ID' first and 'Protein' second.
aptamer_protein.rename(
    columns={'DB ID': 'DrugBank ID', 'PRO': 'Protein'},
    inplace=True,
)
for position, column in enumerate(('DrugBank ID', 'Protein')):
    aptamer_protein.insert(position, column, aptamer_protein.pop(column))

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
aptamer_protein.drop_duplicates().to_csv(
    edge_data_location + 'Raptamerd-protein.txt',
    sep='\t',
    index=None,
)

***
### aptamer drug-disease - http://purl.obolibrary.org/obo/RO_0002606 (is substance that treats)

* [DrugBank](https://go.drugbank.com/categories/DBCAT001641) 

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT001641
aptamer_disease = pd.read_csv(
    unprocessed_data_location + 'aptamer-disease_DrugBank.txt',
    sep='\t',
)
aptamer_disease['Source(s)'] = 'DrugBank'
aptamer_disease

In [None]:
# Manually curated (DrugBank ID, MONDO terms) pairs, one per row of the pasted
# table; several terms for one drug are comma-separated.
aptamer_disease_annotations = [
    ('DB04932', 'MONDO_0019514'),
    ('DB04998', 'MONDO_0004992,MONDO_0002367,MONDO_0004643,MONDO_0009831'),
    ('DB15165', 'MONDO_0005150'),
]
aptamer_disease['DB ID'] = [drugbank_id for drugbank_id, _ in aptamer_disease_annotations]
aptamer_disease['MONDO'] = [mondo_terms for _, mondo_terms in aptamer_disease_annotations]

# One row per (drug, disease) pair.
aptamer_disease['MONDO'] = aptamer_disease['MONDO'].str.split(',')
aptamer_disease = aptamer_disease.explode('MONDO')
aptamer_disease.drop(columns=['Drug', 'Drug Description'], inplace=True)
aptamer_disease

In [None]:
# Final column names, then 'DrugBank ID' first and 'Disease' second.
aptamer_disease.rename(
    columns={'DB ID': 'DrugBank ID', 'MONDO': 'Disease'},
    inplace=True,
)
for position, column in enumerate(('DrugBank ID', 'Disease')):
    aptamer_disease.insert(position, column, aptamer_disease.pop(column))

In [None]:
# Write the de-duplicated rows as a tab-separated file without the index.
aptamer_disease.drop_duplicates().to_csv(
    edge_data_location + 'Raptamerd-disease.txt',
    sep='\t',
    index=None,
)

***
### mRNA vaccines-disease - http://purl.obolibrary.org/obo/RO_0002606 (is substance that treats)

* [DrugBank](https://go.drugbank.com/categories/DBCAT005631) 

In [None]:
# Source table: copy-paste from https://go.drugbank.com/categories/DBCAT005631
mRNAv_disease = pd.read_csv(
    unprocessed_data_location + 'mRNAv-disease_DrugBank.txt',
    sep='\t',
)
mRNAv_disease['Source(s)'] = 'DrugBank'
mRNAv_disease

In [None]:
# Manually curated DrugBank identifiers, one per row of the pasted table;
# every row is assigned the same MONDO term.
vaccine_drugbank_ids = [
    'DB15654', 'DB15695', 'DB15696', 'DB16401',
    'DB16402', 'DB17088', 'DB17090', 'DB17095',
]
mRNAv_disease['DB ID'] = vaccine_drugbank_ids
mRNAv_disease['MONDO'] = 'MONDO_0100096'
mRNAv_disease.drop(columns=['Drug Description', 'Drug'], inplace=True)
mRNAv_disease

In [None]:
# Final column names, then 'DrugBank ID' first and 'Disease' second.
mRNAv_disease.rename(
    columns={'DB ID': 'DrugBank ID', 'MONDO': 'Disease'},
    inplace=True,
)
for position, column in enumerate(('DrugBank ID', 'Disease')):
    mRNAv_disease.insert(position, column, mRNAv_disease.pop(column))

In [None]:
# Write the de-duplicated mRNA vaccine-disease edges as a TSV file without the index column.
mRNAv_disease.drop_duplicates().to_csv(edge_data_location + 'RmRNAv-disease.txt', sep='\t', index=None)

***
### lncRNA-mRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [LncExpDB](https://ngdc.cncb.ac.cn/lncexpdb/) <br /> LncExpDB is a comprehensive database for lncRNA expression. It covers expression profiles of lncRNA genes across various biological contexts, predicts potential functional lncRNAs and their interacting partners, and thus provides essential guidance on experimental design.

In [None]:
# Input file: https://ngdc.cncb.ac.cn/lncexpdb/interactions --> Download button
# Keep rows that name a lncRNA, emit one row per comma-separated lncRNA name and
# discard the columns that are not used for the edges.
lncRNA_mRNA = (
    pd.read_csv(unprocessed_data_location + 'interaction.txt', sep='\t')
    .dropna(subset=['lncname'])
    .assign(lncname=lambda frame: frame['lncname'].str.split(','))
    .explode('lncname')
    .drop(columns=['geneid', 'pcg', 'lnclocation', 'pcglocation', 'ID'])
)
lncRNA_mRNA

In [None]:
# Map both interaction partners through symbol_entrez_map (its '0_x' column is the join key).
# Both merges bring in a '0_y' column, so after the second merge pandas suffixes them:
# '0_y_x' comes from the pcgname merge (left frame), '0_y_y' from the lncname merge (right frame).
lncRNA_mRNA = pd.merge(symbol_entrez_map.rename(columns={'0_x':'lncname'}),lncRNA_mRNA,on='lncname')
lncRNA_mRNA = pd.merge(symbol_entrez_map.rename(columns={'0_x':'pcgname'}),lncRNA_mRNA,on='pcgname')
lncRNA_mRNA.drop(columns=['lncname','pcgname'],inplace=True)
# Append the RNA type to each mapped identifier.
lncRNA_mRNA['0_y_y'] = lncRNA_mRNA['0_y_y'].astype(str)+'?lncRNA'
lncRNA_mRNA['0_y_x'] = lncRNA_mRNA['0_y_x'].astype(str)+'?mRNA'
lncRNA_mRNA['Source(s)'] = 'LncExpDB'
lncRNA_mRNA

Manual fix.

In [None]:
# Rewrite the 'context' column: commas become '|' and every LncExpDB context label is
# replaced by an ontology IRI. The substitutions are applied in the order listed.
context_substitutions = [
    (',', '|'),
    ('Normal Tissue/Cell Line', 'http://purl.obolibrary.org/obo/UBERON_0000479'),
    ('OrganDevelopment', 'http://purl.obolibrary.org/obo/GO_0048513'),
    ('PreimplantationEmbryo', 'http://purl.obolibrary.org/obo/GO_0007566'),
    ('CellDifferentiation', 'http://purl.obolibrary.org/obo/GO_0030154'),
    ('SubcellularLocation', 'http://purl.obolibrary.org/obo/GO_0051179'),
    ('Exosome', 'http://purl.obolibrary.org/obo/GO_0070062'),
    ('CancerCellLine', 'http://purl.obolibrary.org/obo/CLO_0009828'),
    ('VirusInfection', 'http://purl.obolibrary.org/obo/MONDO_0005108'),
    ('Circadian', 'http://purl.obolibrary.org/obo/GO_0007623'),
]
for context_label, context_iri in context_substitutions:
    lncRNA_mRNA['context'] = lncRNA_mRNA['context'].str.replace(context_label, context_iri)

In [None]:
lncRNA_mRNA

In [None]:
# Final column names; the lncRNA endpoint is then moved to the first position.
lncRNA_mRNA.rename(columns={'0_y_y':'lncRNA','0_y_x':'mRNA','context':'Context',
                            'breadth':'Breadth','distance':'Distance'},inplace=True)
lncRNA_mRNA.insert(0,'lncRNA',lncRNA_mRNA.pop('lncRNA'))

In [None]:
# Pass the frame through merge_rows (helper defined elsewhere; presumably it collapses rows
# sharing the same lncRNA/mRNA pair - confirm) and write the result as a TSV file.
merge_rows(lncRNA_mRNA, 'lncRNA', 'mRNA').to_csv(edge_data_location + 'RlncRNA-mRNA.txt', sep='\t', index=None)

***
### riboswitch-protein - http://purl.obolibrary.org/obo/RO_0002529 (is downstream of sequence of)

* [TBDB](https://tbdb.io/) <br /> 

TBDB contains T-box riboswitch fold prediction, tRNA pairs from host organisms, information regarding T-box riboswitch genetic context, and thermodynamic calculations of putative T-box riboswitch sequences found in nature.

In [None]:
! wget https://tbdb.io/database/tbdb.csv

In [None]:
# Load the TBDB export and tag every row with its provenance.
riboswitch_protein = pd.read_csv(
    unprocessed_data_location+'tbdb.csv', sep=','
).assign(**{'Source(s)': 'TBDB'})
riboswitch_protein

In [None]:
# For post-processing purposes: the '.html' suffix is appended to every riboswitch name.
riboswitch_protein.unique_name = riboswitch_protein.unique_name+'.html'

# Lower-case the protein description; it is used as the join key in the next cell.
riboswitch_protein.downstream_protein = riboswitch_protein.downstream_protein.str.lower()
# NOTE(review): this selection does not keep the 'Source(s)' column added when the file was
# loaded, so the written edge file carries no provenance column - confirm this is intended.
riboswitch_protein = riboswitch_protein[['Rank','E_value','Score','Bias','CM_accuracy','unique_name',
                   'deltadelta_g','downstream_protein']]

In [None]:
# Fix join columns: match the lower-cased protein description against column 0 of
# desc_pro_map (inner join), then drop both join keys; the remaining desc_pro_map
# columns stay in the frame.
riboswitch_protein = pd.merge(riboswitch_protein, desc_pro_map, left_on=['downstream_protein'], right_on=[0])
riboswitch_protein.drop(columns=[0],inplace=True)
riboswitch_protein = riboswitch_protein.drop(columns=['downstream_protein'])
riboswitch_protein

In [None]:
# Final column names (column 1 comes from desc_pro_map), then move both edge
# endpoints to the first two positions.
riboswitch_protein.rename(
    columns={
        'E_value': 'E value',
        'CM_accuracy': 'CM accuracy',
        'unique_name': 'Riboswitch',
        'deltadelta_g': 'deltadeltaG',
        1: 'Protein',
    },
    inplace=True,
)
for position, column in enumerate(('Riboswitch', 'Protein')):
    riboswitch_protein.insert(position, column, riboswitch_protein.pop(column))

In [None]:
# Pass the frame through merge_rows (helper defined elsewhere; presumably it collapses rows
# sharing the same Riboswitch/Protein pair - confirm) and write the result as a TSV file.
merge_rows(riboswitch_protein, 'Riboswitch', 'Protein').to_csv(edge_data_location + 'Rriboswitch-protein.txt', sep='\t', index=None)

***
### riboswitch-bacterial strain - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [RSwitch database](https://penchovsky.atwebpages.com/applications.php?page=58) <br /> 
The RSwitch database contains information on using riboswitches as antibacterial drug targets. Each riboswitch is represented by its ID, name, aptamer sequences, secondary structures, multiple alignments, consensus motifs, and biochemical pathways.

In [None]:
# Load the header-less RSwitch export (columns are numbered 0, 1, 2, ...) and tag
# every row with its provenance.
riboswitch_bactStrain = pd.read_csv(
    unprocessed_data_location + 'rswitch.csv', header=None
).assign(**{'Source(s)': 'RSwitch'})
riboswitch_bactStrain

In [None]:
# Export the distinct values of column 2, one per line, so that they can be submitted
# to the NCBI taxonomy identifier service referenced below.
riboswitch_bactStrain[2].drop_duplicates().to_csv(
    unprocessed_data_location + 'bacteria.txt', header=None, sep='\n', index=None)
# --> https://www.ncbi.nlm.nih.gov/Taxonomy/TaxIdentifier/tax_identifier.cgi (Note that some manual work is needed)

In [None]:
# Load the NCBI taxonomy report, whose fields are separated by "<TAB>|<TAB>".
# The separator is a regular expression, so it is written as a raw string: in a normal
# string literal "\|" is an invalid escape sequence that newer Python versions warn about.
bacteria = pd.read_csv(unprocessed_data_location + 'tax_report.txt', sep=r'\t\|\t', engine='python')
# Nullable integer dtype keeps missing taxids as <NA> instead of turning the column into floats.
bacteria.taxid = bacteria.taxid.astype('Int64')
bacteria

In [None]:
# Attach the NCBI taxid to every row by joining column 2 (renamed to 'name') with the
# taxonomy report.
riboswitch_bactStrain = pd.merge(riboswitch_bactStrain.rename(columns={2:'name'}),
                                 bacteria[['name','taxid']],on=['name'])
riboswitch_bactStrain.taxid = 'http://purl.obolibrary.org/obo/NCBITaxon_'+riboswitch_bactStrain.taxid.astype(str)
# A missing taxid stringifies to '<NA>'; drop the rows for which no taxid was found.
riboswitch_bactStrain = riboswitch_bactStrain[riboswitch_bactStrain.taxid != 'http://purl.obolibrary.org/obo/NCBITaxon_<NA>']
riboswitch_bactStrain = riboswitch_bactStrain.drop(columns=[1,'name'])
riboswitch_bactStrain

In [None]:
# Final column names; keep only the two edge endpoints and the provenance column.
riboswitch_bactStrain.rename(columns={0:'Riboswitch','taxid':'Bacterial strain'},inplace=True)
riboswitch_bactStrain = riboswitch_bactStrain[['Riboswitch', 'Bacterial strain', 'Source(s)']]

In [None]:
# Write the de-duplicated riboswitch-bacterial strain edges as a TSV file without the index column.
riboswitch_bactStrain.drop_duplicates().to_csv(edge_data_location + 'Rriboswitch-bactStrain.txt', sep='\t', index=None)

***
### riboswitch-biological process - http://purl.obolibrary.org/obo/RO_0000056 (participates in)

* [TBDB](https://tbdb.io/) <br /> 

In [None]:
#https://tbdb.io/database/
# Re-read the TBDB export; the '.html' suffix is appended to the riboswitch names here as well.
riboswitch_gobp = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',') 
riboswitch_gobp.unique_name = riboswitch_gobp.unique_name+'.html'
riboswitch_gobp['Source(s)'] = 'TBDB'

# Extract only GO terms: take the text between the last '[' and the following last ']'
# of the protein description and replace ':' with '_'.
gobp = riboswitch_gobp.protein_desc.str.rpartition('[')[2].str.rpartition(']')[0].str.replace(":", "_")
riboswitch_gobp = pd.concat([riboswitch_gobp, gobp.rename('gobp')], axis=1)
# Keep the rows whose extracted text contains 'GO' (missing values count as no match).
riboswitch_gobp = riboswitch_gobp[riboswitch_gobp.gobp.str.contains("GO", na=False)]
riboswitch_gobp[['unique_name', 'gobp']]

In [None]:
# Final names of the two edge endpoints.
riboswitch_gobp.rename(columns={'unique_name':'Riboswitch','gobp':'Biological process'},inplace=True)

In [None]:
# Write only the two endpoint columns, de-duplicated, as a TSV file without the index column.
riboswitch_gobp[['Riboswitch', 'Biological process']].drop_duplicates().to_csv(
    edge_data_location + 'Rriboswitch-gobp.txt', sep='\t', index=None)

***
### ribozyme-GO - http://purl.obolibrary.org/obo/RO_0000056 (participates in)

* [Ribocentre](https://www.ribocentre.org/) <br />
Ribocentre is designed to contain comprehensive information of all natural ribozymes.

In [None]:
# Fetch the Ribocentre export into the unprocessed-data folder (data_downloader is defined elsewhere).
data_downloader('https://www.ribocentre.org/38dffd70-0f9f-499b-b442-be2f6e91a156', unprocessed_data_location)

In [None]:
# Load the Ribocentre application table, keep the three columns used below and tag
# every row with its provenance.
ribozyme_go = pd.read_csv(unprocessed_data_location + 'Ribocentre - Application.csv') 
ribozyme_go = ribozyme_go[['ribozyme name', 'Description', 'pubmed ID']]
ribozyme_go['Source(s)'] = 'Ribocentre'
ribozyme_go

In [None]:
# Manual curation: one GO term (or '' for none) per row of the table loaded above. The list
# is assigned positionally, so it has to match the number and the order of those rows.
ribozyme_go['go'] = ['','','GO_0015867', 'GO_0032363', 'GO_0010468', 'GO_0010468', 'GO_0010468', 'GO_2000232',
                         'GO_0010468', 'GO_0010468', 'GO_0003743', '', '', '', '', '', '', '', 'GO_0010468',
                         '', '', '', 'GO_0050790', '', '', '', '', '', 'GO_0050790', '', '', '', '', '']
ribozyme_go.insert(1,'go',ribozyme_go.pop('go'))
# Join on the ribozyme name (column 0 of ribozyme_rfam_map) and drop the name afterwards.
ribozyme_go = pd.merge(ribozyme_rfam_map.rename(columns={0:'ribozyme name'}),
                       ribozyme_go, on='ribozyme name').drop(columns='ribozyme name')
# Rows without a curated GO term are discarded.
ribozyme_go = ribozyme_go[ribozyme_go['go']!='']
ribozyme_go

In [None]:
# Turn the PubMed identifiers into PubMed URLs, then apply the final column names
# (column 1 comes from ribozyme_rfam_map).
ribozyme_go['pubmed ID'] = 'https://pubmed.ncbi.nlm.nih.gov/' + ribozyme_go['pubmed ID'].astype('Int64').astype('str')
ribozyme_go.rename(columns={'go':'Biological process',1:'Ribozyme','pubmed ID':'References (PMID)'},inplace=True)

In [None]:
# Write the ribozyme-GO edges as a TSV file without the index column.
ribozyme_go.to_csv(edge_data_location + 'Rribozyme-GO.txt', sep='\t', index=None)

***
### viral RNA-ribozyme - http://purl.obolibrary.org/obo/RO_0002526 (overlaps sequence of)

* [ViroidDB](https://viroids.org/) <br />
ViroidDB is the most comprehensive collection of viroid, satellite RNA, retrozyme, and deltavirus genome sequences available on the internet. 

In [None]:
# Fetch the full ViroidDB JSON dump into the unprocessed-data folder (data_downloader is defined elsewhere).
data_downloader('https://viroids.org/db/latest/all.json', unprocessed_data_location)

In [None]:
# Load the ViroidDB dump; the JSON is transposed after loading.
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T 

# Extract ribozymes: every hit sits on a line of its own that starts with ">> ".
# The surrounding newlines and the ">> " marker are stripped from each hit.
myre = re.compile(r"\n>> .*?\n")
ribozyme = [
    [hit.replace("\n", '').replace(">> ", '') for hit in myre.findall(report)]
    for report in vRNA_ribozyme.ribozymes
]

# List of all possible ribozymes (useful for mapping)
a = list(itertools.chain.from_iterable(ribozyme))
set(a)

In [None]:
# Attach the per-record list of extracted ribozymes as column 0 (the index is reset so that
# both objects align by position), then emit one row per ribozyme hit.
vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']),
                           pd.Series(ribozyme)], axis=1)
vRNA_ribozyme = vRNA_ribozyme.explode(0)
# Keep only the first whitespace-separated token of each hit.
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
# Discard the columns that are not used for the edges.
vRNA_ribozyme.drop(columns=['isolationSource','collectionDate','gc','bioSample','genus','family','identicalSeqs','genBankTitle','displayTitle','length',
                           'sequenceType','nucCompleteness','genotype','segment','publications',
                           'geoLocation','country','usa','submitters','releaseDate','isolate',
                            'genus','family','sequence','structure','type','ribozymes','Cls_ID80',
                            'Cls_ID70','Cls_ID85','Cls_ID75','Cls_ID95','Cls_ID90','sraAccession','submitters','species','host'],
                   inplace=True)
# Re-insert the ribozyme column at the front under the label 1.
vRNA_ribozyme.insert(0,1,vRNA_ribozyme.pop(0))
vRNA_ribozyme

In [None]:
# Join the extracted ribozyme token (column 1) with column 0 of ribozyme_rfam_map. Both frames
# have a column labelled 1, so pandas suffixes them: '1_x' is the map's column, '1_y' the token.
vRNA_ribozyme = pd.merge(ribozyme_rfam_map,vRNA_ribozyme,left_on=0,right_on=1)
vRNA_ribozyme.drop(columns=[0],inplace=True)
vRNA_ribozyme.insert(1,'accession',vRNA_ribozyme.pop('accession'))
vRNA_ribozyme['Source(s)'] = 'ViroidDB'
vRNA_ribozyme = vRNA_ribozyme.drop(columns=['1_y','moleculeType'])
vRNA_ribozyme

In [None]:
# Final column names; keep only the two edge endpoints and the provenance column.
vRNA_ribozyme.rename(columns={'1_x':'Ribozyme','accession':'Viral RNA'},inplace=True)
vRNA_ribozyme = vRNA_ribozyme[['Viral RNA', 'Ribozyme', 'Source(s)']]

In [None]:
# Write the de-duplicated viral RNA-ribozyme edges as a TSV file without the index column.
vRNA_ribozyme.drop_duplicates().to_csv(edge_data_location + 'RviralRNA-ribozyme.txt', sep='\t', index=None)

***
### circRNA-extracellular form - http://purl.obolibrary.org/obo/RO_0001018 (contained in)

* [miRandola](http://mirandola.iit.cnr.it/) <br /> miRandola is a comprehensive manually curated classification of different extracellular circulating non-coding RNA types.

In [None]:
! wget http://mirandola.iit.cnr.it/download/miRandola_version_02_2017.txt

In [None]:
# Load miRandola and keep the circRNA rows whose organism field contains 'apiens'.
circRNA_ev = pd.read_csv(unprocessed_data_location+'miRandola_version_02_2017.txt', sep='\t')
circRNA_ev = circRNA_ev[(circRNA_ev['RNA_class'] == 'circRNA') & (circRNA_ev['organism'].str.contains('apiens'))]

# circRNA in miRandola only circulates in blood
circRNA_ev['gocc'] = 'GO_0072562'
# Map the RNA name through symbol_entrez_map (its '0_x' column is the join key); the mapped
# identifier arrives in column '0_y'.
circRNA_ev = pd.merge(circRNA_ev, symbol_entrez_map.rename(columns={'0_x':'RNA'}), on='RNA')

# Append the RNA class to the mapped identifier.
circRNA_ev['0_y'] = circRNA_ev['0_y'].astype(str) + '?' + circRNA_ev['RNA_class'].astype(str)
circRNA_ev = circRNA_ev.drop(columns=['RNA','RNA_class','miRBase_accession','miRBase_Last_Version','miRBase_family','circRNA_accession','organism'])
circRNA_ev['Source(s)'] = 'miRandola'
circRNA_ev

Manual fix.

In [None]:
# Replace the free-text sample and method labels with ontology IRIs followed by a readable label.
circRNA_ev['sample'] = circRNA_ev['sample'].str.replace('blood','http://purl.obolibrary.org/obo/UBERON_0000178 (blood)')

# Show the distinct method labels before they are rewritten.
print(circRNA_ev['method'].unique())
circRNA_ev['method'] = circRNA_ev['method'].str.replace('qPCR','http://purl.obolibrary.org/obo/NCIT_C51962 (qpcr)')

In [None]:
# Turn the PubMed identifiers into PubMed URLs; a missing identifier stringifies to '<NA>'
# and the resulting URL is turned back into a missing value.
# NOTE(review): this relies on the alias `np`; the import cell at the top of the notebook
# shows `import numpy` without an alias - confirm `np` is bound elsewhere.
circRNA_ev['PubMed_ID'] = 'https://pubmed.ncbi.nlm.nih.gov/' + circRNA_ev['PubMed_ID'].astype('Int64', errors='ignore').astype('str')
circRNA_ev['PubMed_ID'] = circRNA_ev['PubMed_ID'].replace('https://pubmed.ncbi.nlm.nih.gov/<NA>', np.nan)
# Final column names and column order of the edge file.
circRNA_ev.rename(columns={'0_y':'circRNA','gocc':'Extracellular form','description':'Description',
                           'sample':'Sample', 'disease_or_cell_line': 'Disease/Cell line',
                           'PubMed_ID' : 'References (PMID)', 'method':'Method','exRNA_type':'exRNA type'},inplace=True)
circRNA_ev = circRNA_ev[['circRNA', 'Extracellular form', 'Description', 'Sample', 'Disease/Cell line',
                         'Method', 'exRNA type', 'References (PMID)', 'Source(s)']]

In [None]:
# Write the circRNA-extracellular form edges as a TSV file without the index column.
circRNA_ev.to_csv(edge_data_location + 'RcircRNA-gocc.txt', sep='\t', index=None)

***
### Ribozyme-GO - http://purl.obolibrary.org/obo/RO_0000056 | 1025 | 85

* [Rfam](http://rfamlive.xfam.org/) <br /> The Rfam database is a collection of RNA families, each represented by multiple sequence alignments, consensus secondary structures and covariance models.

In [None]:
# Ribozymes and the path of their Rfam family page.
rfam_GO = pd.DataFrame(data=[['LC ribozyme','family/RF00011'],
                                 ['glmS ribozyme','family/RF00234'],
                                 ['HDV-F-prausnitzii','family/RF02682'],
                                 ['Pistol ribozyme','family/RF02679'],
                                 ['RNAse P','family/RF00009']#,
                                 #['VS ribozyme',''] absent in RFAM
                                 ])

# Scrape each family page (network access required): the row labelled 0 of the fifth HTML
# table is taken from every page and the pieces are stacked.
urls = ['http://rfamlive.xfam.org/' + i + '#tabview=tab8' for i in rfam_GO[1]]
df = pd.DataFrame()
for url in urls:
    temp = pd.read_html(url)[4].loc[0]
    df = pd.concat([df, temp])
df.reset_index(inplace=True, drop=True)
# Keep every second stacked entry (odd positions).
# NOTE(review): presumably these are the cells holding the GO annotations - confirm.
df = df[df.index % 2 != 0]
# Positional assignment: one family per kept entry, in the same order as `urls`.
df['ID'] = ['family/RF00011', 'family/RF00234', 'family/RF02682', 'family/RF02679', 'family/RF00009']
# One row per '; '-separated annotation, then reduce each annotation to a bare identifier:
# parenthesised text, ';' and blanks are removed and ':' becomes '_'.
df[0] = df[0].str.split('; ')
df = df.explode(0)
df[0] = df[0].str.replace(r'\(.*?\)', '', regex=True)
df[0] = df[0].str.replace(';', '')
df[0] = df[0].str.replace(' ', '')
df[0] = df[0].str.replace(':', '_')
df['Source(s)'] = 'RFAM'
df = df[['ID', 0, 'Source(s)']]
df.rename(columns={'ID': 'Ribozyme', 0:'GO'},inplace=True)
df



In [None]:
df['GO'].unique()

In [None]:
# Rows annotated with GO_0008033 or GO_0010468 are written as biological-process edges.
df[df['GO'].isin(['GO_0008033', 'GO_0010468'])].rename(
    columns={'GO': 'Biological process'}
).to_csv(edge_data_location +'Rribozyme-GObp.txt', sep='\t', index=None)

In [None]:
# Rows annotated with GO_0004526 or GO_0003824 are written as molecular-function edges.
df[df['GO'].isin(['GO_0004526', 'GO_0003824'])].rename(
    columns={'GO': 'Molecular function'}
).to_csv(edge_data_location + 'Rribozyme-GOmf.txt', sep='\t', index=None)

In [None]:
# Rows annotated with GO_0030677 or GO_0030680 are written as cellular-component edges.
df[df['GO'].isin(['GO_0030677', 'GO_0030680'])].rename(
    columns={'GO': 'Cellular component'}
).to_csv(edge_data_location + 'Rribozyme-GOcc.txt', sep='\t', index=None)

***
### circRNA-miRNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with)

* [SomamiR DB](https://compbio.uthsc.edu/SomamiR/)

In [None]:
! wget https://compbio.uthsc.edu/SomamiR/download/circRNA_somatic_v2.0.txt.tar.gz

In [None]:
# Load the SomamiR circRNA table without a header row (columns are numbered 0, 1, 2, ...).
# NOTE(review): the file is a .tar.gz archive handed directly to read_csv; presumably
# skiprows=[0] discards the line holding the tar header together with the column names -
# confirm that the parsed rows are clean (e.g. no padding row at the end).
circRNA_miRNA = pd.read_csv(unprocessed_data_location + 'circRNA_somatic_v2.0.txt.tar.gz', sep="\t", header=None, skiprows=[0])
circRNA_miRNA = circRNA_miRNA.drop(columns=[1,5])
circRNA_miRNA['Source(s)'] = 'SomamiR DB'
circRNA_miRNA

***
* [miRNet](https://www.mirnet.ca/)

In [None]:
! wget https://www.dropbox.com/s/oxraur4z5921sg4/miRNet-mir-circRNA.csv?dl=0

In [None]:
# Load the miRNet circRNA table, discard the columns that are not used for the edges
# and tag every row with its provenance.
circRNA_miRNA2 = (
    pd.read_csv(unprocessed_data_location + 'miRNet-mir-circRNA.csv?dl=0')
    .drop(columns=['mirnet', 'mbv', 'embl', 'gene_name'])
    .assign(**{'Source(s)': 'miRNet'})
)
circRNA_miRNA2

In [None]:
# Combine SomamiR (columns 0 = circRNA symbol, 4 = miRNA id) with miRNet ('symbol', 'mir_id');
# the outer join keeps the pairs that occur in only one of the two sources.
circRNA_miRNA = pd.merge(circRNA_miRNA, circRNA_miRNA2, how='outer', left_on=[0, 4], right_on=['symbol', 'mir_id'])

# Fill each pair of key columns from its counterpart. The result is assigned back instead of
# calling fillna(inplace=True) on a selected column, which is not guaranteed to update the
# parent frame.
circRNA_miRNA[0] = circRNA_miRNA[0].fillna(circRNA_miRNA['symbol'])
circRNA_miRNA[4] = circRNA_miRNA[4].fillna(circRNA_miRNA['mir_id'])
circRNA_miRNA['mir_id'] = circRNA_miRNA['mir_id'].fillna(circRNA_miRNA[4])
circRNA_miRNA['symbol'] = circRNA_miRNA['symbol'].fillna(circRNA_miRNA[0])

# Map both endpoints to their identifiers (inner joins: unmapped rows are dropped).
circRNA_miRNA = pd.merge(circRNA_miRNA, symbol_entrez_map.rename(columns={'0_x':0}), on=0)
circRNA_miRNA = pd.merge(circRNA_miRNA, mirna_mirbase_map.rename(columns={2:4, 0:'a'}), on=4)
circRNA_miRNA['0_y'] = circRNA_miRNA['0_y'].astype(str) + '?circRNA'

# Join the provenance of both sources with '|'. A missing source stringifies to 'nan' and is
# removed again together with its separator.
circRNA_miRNA['Source(s)_x'] = circRNA_miRNA['Source(s)_x'].astype(str)
circRNA_miRNA['Source(s)_y'] = circRNA_miRNA['Source(s)_y'].astype(str)
circRNA_miRNA['Source(s)'] = circRNA_miRNA['Source(s)_x'] + '|' + circRNA_miRNA['Source(s)_y']
circRNA_miRNA = circRNA_miRNA.drop(columns=['Source(s)_x', 'Source(s)_y'])

# Raw string: "\|" is a regex escape, not a valid Python string escape.
circRNA_miRNA['Source(s)'] = circRNA_miRNA['Source(s)'].str.replace(r'nan\||\|nan', '', regex=True)

circRNA_miRNA = circRNA_miRNA.drop(columns=[0,4,3,7,8,'mir_acc','mir_id','symbol','entrez'])
circRNA_miRNA

Manual fix of inconsistencies.

- Disease.

In [None]:
# Normalise the free-text disease annotations (column 16) into a '|'-separated list and map
# them with replace_with_mondo (defined elsewhere).
# All patterns of the first group are literal text, so regex=False is passed explicitly:
# interpreted as regular expressions, "[ns]" is a character class (any 'n' or 's'), "[" is
# not a valid pattern and "||" matches the empty string.
disease = circRNA_miRNA[16].str.lower()
for literal, replacement in [("[ns]", ']'), ("]", '|'), ("[", ''), (", ", '|'), ("||", '|')]:
    disease = disease.str.replace(literal, replacement, regex=False)
# A value made of separators only carries no disease.
disease = disease.replace("|||", numpy.nan)
disease = disease.str.replace('_', ' ', regex=False)
disease = disease.str.replace('lung/sclc', 'small cell lung carcinoma', regex=False)
# Remove parenthesised text and a trailing separator.
disease = disease.replace(r'\(.*?\)', '', regex=True)
disease = disease.replace(r'\|$', '', regex=True)
circRNA_miRNA[16] = [replace_with_mondo(item) for item in disease]
circRNA_miRNA[16].unique()[:5]

In [None]:
# Turn the PubMed identifiers (column 6) into PubMed URLs.
circRNA_miRNA[6] = 'https://pubmed.ncbi.nlm.nih.gov/' + circRNA_miRNA[6].astype('Int64', errors='ignore').astype('str')
# Strip only a float-style ".0" suffix of an identifier (at the end of the value or before a
# '|' separator). The unescaped pattern '.0' matched any character followed by a zero and
# therefore deleted digits inside the PMIDs.
circRNA_miRNA[6] = circRNA_miRNA[6].str.replace(r'\.0(?=\||$)', '', regex=True)
# '|' separates several identifiers; it is a literal here, not a regex alternation.
circRNA_miRNA[6] = circRNA_miRNA[6].str.replace('|', '|https://pubmed.ncbi.nlm.nih.gov/', regex=False)
# Missing identifiers stringify to '<NA>' (or to 'nan' when the Int64 cast was skipped).
circRNA_miRNA[6] = circRNA_miRNA[6].replace(
    ['https://pubmed.ncbi.nlm.nih.gov/<NA>', 'https://pubmed.ncbi.nlm.nih.gov/nan'], numpy.nan)

# Final column names; labels that are no longer present in the frame are ignored by rename.
circRNA_miRNA.rename(columns={0:'Disease/Cell line', 6:'References (PMID)', 9:'Type', 10:'Modification',
                              11:'circRNA binding sequence (binding in upper case)', 12:'miRNA binding sequence',
                              13:'Modification2', 15:'Source', 5:'miRNA', 'genbank_id':'GenBank ID',
                              '0_y':'circRNA', 'a':'miRNA', 2:'Mutation', 16:'Disease(s)'},inplace=True)
# Move both edge endpoints to the first two positions.
circRNA_miRNA.insert(0,'circRNA',circRNA_miRNA.pop('circRNA'))
circRNA_miRNA.insert(1,'miRNA',circRNA_miRNA.pop('miRNA'))

In [None]:
# Split the edges by the prefix of the miRNA identifier: 'MIMAT...' goes to the miRNA file,
# any other 'MI...' identifier goes to the pre-miRNA file.
circRNA_maturemiRNA = circRNA_miRNA[(circRNA_miRNA['miRNA'].str.startswith('MIMAT'))]
circRNA_premiRNA = circRNA_miRNA[(circRNA_miRNA['miRNA'].str.startswith('MI')) &
                                 (~circRNA_miRNA['miRNA'].str.startswith('MIMAT'))]

# merge_rows is defined elsewhere; presumably it collapses rows sharing the same
# circRNA/miRNA pair - confirm.
merge_rows(circRNA_maturemiRNA, 'circRNA', 'miRNA').to_csv(edge_data_location + 'RcircRNA-miRNA.txt', sep='\t', index=None)
merge_rows(circRNA_premiRNA, 'circRNA', 'miRNA').to_csv(edge_data_location + 'RcircRNA-premiRNA.txt', sep='\t', index=None)

***
### Remove unprocessed raw data

In [None]:
#shutil.rmtree(unprocessed_data_location)

***
#### PheKnowLator works with at least 2 rows (we removed headers) per dataframe.
Here, we make sure every processed TSV has at least 2 rows.

In [None]:
# Every entry of edge_source_list.txt names an edge file in its second field; the paths
# are made relative to this notebook.
nodes = pd.read_csv('../resources/edge_source_list.txt',sep=', ',header=None, engine='python')
nodes[1] = '../'+nodes[1].astype(str)

for i in set(nodes[1]):
    #Read every df
    df = pd.read_csv(i,sep='\t',header=None)
    #If df has one single row, then double it
    if len(df) == 1:
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0
        pd.concat([df, df]).to_csv(i, header=None, sep='\t', index=None)

## Non-ontological entities
Non-ontological entities' identifiers **must** be added to `subclass_construction_map.pkl`. A proper class has to be chosen and linked as their subClass.

In [None]:
# Print every node type that occurs in the first 115 edge-type labels ("<subject>-<object>").
nodes = pd.read_csv('../resources/edge_source_list.txt',sep=', ',header=None, engine='python')[:115]
# Split on the first '-' only; the split limit is passed by keyword because passing it
# positionally to Series.str.split is deprecated.
nodes[['A', 'B']] = nodes[0].str.split('-', n=1, expand=True)
a = set(nodes['A'])
b = set(nodes['B'])
print(a.union(b))

In [None]:
# Provided by PKL ecosystem
data_downloader(processed_url+'subclass_construction_map.pkl', '../resources/construction_approach/')

# Load data, print row count, and preview it
# NOTE(review): unpickling can execute arbitrary code, so this is only safe as long as the
# location behind `processed_url` is trusted.
nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_map.pkl')

# For instance, ncbi IDs are mapped to appropriate SO Ontology entries
list(nonO_data.items())[:5]

***
### miRNA sequences

In [None]:
# Split the miRBase map by the identifier prefix in column 0 and attach the ontology class
# list to each part. The slices are copied explicitly so that adding the 'SO' column never
# writes to (or warns about) a view of mirna_mirbase_map.
mature_mirna = mirna_mirbase_map[mirna_mirbase_map[0].str.startswith('MIMAT')].copy()
mature_mirna['SO'] = [['SO_0000276']] * len(mature_mirna)

pre_mirna = mirna_mirbase_map[~mirna_mirbase_map[0].str.startswith('MIMAT')].copy()
pre_mirna['SO'] = [['SO_0000647']] * len(pre_mirna)

mirna_mirbase_map = pd.concat([mature_mirna, pre_mirna])

# identifier (column 0) -> class list, merged into the subclass construction map
mirna_nonO = mirna_mirbase_map.drop(1, axis=1).set_index(0).to_dict()
nonO_data = {**nonO_data, **mirna_nonO['SO']}

In [None]:
# replace every wrongly inserted # with ? within input TSV
#for i in os.listdir("../resources/edge_dataOLD/"):
 #   if "txt" in i:
  #      with open("../resources/edge_dataOLD/"+i, "rt") as fin:
   #         with open("../resources/edge_data/"+i, "wt") as fout:
    #            for line in fin:
     #               fout.write(line.replace('#', '?'))
    #else:
     #   shutil.copyfile("../resources/edge_dataOLD/"+i, "../resources/edge_data/"+i)

***
### ASO sequences

In [None]:
# Distinct values of column 0 of the ASO-mRNA edge file.
ASOnonO_data = (
    pd.read_csv('../resources/edge_data/RASO-mRNA.txt', sep='\t', header=None)[0]
    .drop_duplicates()
    .to_frame()
)

# Map every collected identifier to ['SO_0000644'] and merge the mapping into nonO_data.
ASOnonO_data['SO'] = [['SO_0000644']] * len(ASOnonO_data)
ASOnonO_data = ASOnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **ASOnonO_data['SO']}

***
### ASO drugs

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
ASOdnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RASOd-mRNA.txt', 1),
        ('RASOd-disease.txt', 0),
        ('RASOd-protein11007.txt', 0),
        ('RASOd-protein10002.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['CHEBI_76720'] and merge the mapping into nonO_data.
ASOdnonO_data = pd.DataFrame(ASOdnonO_data)
ASOdnonO_data['SO'] = [['CHEBI_76720']] * len(ASOdnonO_data)
ASOdnonO_data = ASOdnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **ASOdnonO_data['SO']}

***
### Aptamer drugs

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
aptamerdnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('Raptamerd-protein.txt', 0),
        ('Raptamerd-disease.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['CHEBI_140490'] and merge the mapping into nonO_data.
aptamerdnonO_data = pd.DataFrame(aptamerdnonO_data)
aptamerdnonO_data['SO'] = [['CHEBI_140490']] * len(aptamerdnonO_data)
aptamerdnonO_data = aptamerdnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **aptamerdnonO_data['SO']}

***
### Aptamer sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
aptamernonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('Raptamer-protein.txt', 0),
        ('Raptamer-chemical.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['CHEBI_140490'] and merge the mapping into nonO_data.
aptamernonO_data = pd.DataFrame(aptamernonO_data)
aptamernonO_data['SO'] = [['CHEBI_140490']] * len(aptamernonO_data)
aptamernonO_data = aptamernonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **aptamernonO_data['SO']}

***
### circRNA sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
circRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RcircRNA-disease.txt', 0),
        ('RcircRNA-gocc.txt', 0),
        ('RcircRNA-miRNA.txt', 0),
        ('RcircRNA-premiRNA.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0002291'] and merge the mapping into nonO_data.
circRNAnonO_data = pd.DataFrame(circRNAnonO_data)
circRNAnonO_data['SO'] = [['SO_0002291']] * len(circRNAnonO_data)
circRNAnonO_data = circRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **circRNAnonO_data['SO']}

***
### gRNA sequences

In [None]:
# Distinct, non-missing values of column 0 of the gRNA-gene edge file.
gRNAnonO_data = (
    pd.read_csv('../resources/edge_data/RgRNA-gene.txt', sep='\t', header=None)[0]
    .drop_duplicates()
    .dropna()
    .to_frame()
)

# Map every collected identifier to ['SO_0000602'] and merge the mapping into nonO_data.
gRNAnonO_data['SO'] = [['SO_0000602']] * len(gRNAnonO_data)
gRNAnonO_data = gRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **gRNAnonO_data['SO']}

***
### lncRNA sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
lncRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RmiRNA-lncRNA.txt', 1),
        ('RpremiRNA-lncRNA.txt', 1),
        ('RsnoRNA-lncRNA.txt', 1),
        ('RlncRNA-gene.txt', 0),
        ('RlncRNA-disease.txt', 0),
        ('RlncRNA-chemical.txt', 0),
        ('RsmallProtein-lncRNA.txt', 1),
        ('RlncRNA-protein.txt', 0),
        ('RlncRNA-expression2245.txt', 0),
        ('RlncRNA-expression2246.txt', 0),
        ('RlncRNA-expression2291.txt', 0),
        ('RlncRNA-role.txt', 1),
        ('RlncRNA-gocc.txt', 0),
        ('RlncRNA-pw.txt', 0),
        ('RlncRNA-gobp.txt', 0),
        ('RlncRNA-mRNA.txt', 0),
        ('Hgene-lncRNA.txt', 1),
        ('HlncRNA-anatomy.txt', 0),
        ('HlncRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0001877'] and merge the mapping into nonO_data.
lncRNAnonO_data = pd.DataFrame(lncRNAnonO_data)
lncRNAnonO_data['SO'] = [['SO_0001877']] * len(lncRNAnonO_data)
lncRNAnonO_data = lncRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **lncRNAnonO_data['SO']}

***
### mRNA vaccines sequences

In [None]:
# Distinct values of column 0 of the mRNA vaccine-disease edge file.
mRNAvnonO_data = (
    pd.read_csv('../resources/edge_data/RmRNAv-disease.txt', sep='\t', header=None)[0]
    .drop_duplicates()
    .to_frame()
)

# Map every collected identifier to ['VO_0000186'] and merge the mapping into nonO_data.
mRNAvnonO_data['SO'] = [['VO_0000186']] * len(mRNAvnonO_data)
mRNAvnonO_data = mRNAvnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **mRNAvnonO_data['SO']}

***
### scaRNA sequences

In [None]:
# Collect the distinct, non-missing identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
scaRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RsnoRNA-scaRNA.txt', 1),
        ('Hgene-scaRNA.txt', 1),
        ('HscaRNA-anatomy.txt', 0),
        ('HscaRNA-cell.txt', 0),
    ]
]).drop_duplicates().dropna()

# Map every collected identifier to ['SO_0002095'] and merge the mapping into nonO_data.
scaRNAnonO_data = pd.DataFrame(scaRNAnonO_data)
scaRNAnonO_data['SO'] = [['SO_0002095']] * len(scaRNAnonO_data)
scaRNAnonO_data = scaRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **scaRNAnonO_data['SO']}

***
### scRNA sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
scRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('Hgene-scRNA.txt', 1),
        ('HscRNA-anatomy.txt', 0),
        ('HscRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0000013'] and merge the mapping into nonO_data.
scRNAnonO_data = pd.DataFrame(scRNAnonO_data)
scRNAnonO_data['SO'] = [['SO_0000013']] * len(scRNAnonO_data)
scRNAnonO_data = scRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **scRNAnonO_data['SO']}

***
### snRNA sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
snRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RsnoRNA-snRNA.txt', 1),
        ('Hgene-snRNA.txt', 1),
        ('HsnRNA-anatomy.txt', 0),
        ('HsnRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0000274'] and merge the mapping into nonO_data.
snRNAnonO_data = pd.DataFrame(snRNAnonO_data)
snRNAnonO_data['SO'] = [['SO_0000274']] * len(snRNAnonO_data)
snRNAnonO_data = snRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **snRNAnonO_data['SO']}

***
### tRNA sequences

In [None]:
# Collect the distinct, non-missing identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
tRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RtsRNA-tRNA_tRFdb.txt', 1),
        ('RtsRNA-tRNA_MINTbase.txt', 1),
        ('RtRNA-aminoacid.txt', 0),
        ('RsnoRNA-tRNA.txt', 1),
    ]
]).drop_duplicates().dropna()

# Map every collected identifier to ['SO_0000253'] and merge the mapping into nonO_data.
tRNAnonO_data = pd.DataFrame(tRNAnonO_data)
tRNAnonO_data['SO'] = [['SO_0000253']] * len(tRNAnonO_data)
tRNAnonO_data = tRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **tRNAnonO_data['SO']}

***
### Retained intron sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
rinonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RsnoRNA-retainedIntron.txt', 1),
        ('Hgene-retained_intron.txt', 1),
        ('Hretained_intron-anatomy.txt', 0),
        ('Hretained_intron-cell.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0000188'] and merge the mapping into nonO_data.
rinonO_data = pd.DataFrame(rinonO_data)
rinonO_data['SO'] = [['SO_0000188']] * len(rinonO_data)
rinonO_data = rinonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **rinonO_data['SO']}

***
### rRNA sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
rRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RsnoRNA-rRNA.txt', 1),
        ('Hgene-rRNA.txt', 1),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0000252'] and merge the mapping into nonO_data.
# Both inputs come from column 1, so the combined column keeps the label 1.
rRNAnonO_data = pd.DataFrame(rRNAnonO_data)
rRNAnonO_data['SO'] = [['SO_0000252']] * len(rRNAnonO_data)
rRNAnonO_data = rRNAnonO_data.set_index(1).to_dict()
nonO_data = {**nonO_data, **rRNAnonO_data['SO']}

***
### Pseudogene sequences

In [None]:
# Collect the distinct identifiers from the listed (edge file, column) pairs.
# pd.concat replaces the chained Series.append, which was removed in pandas 2.0.
pseudononO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
    for file_name, column in [
        ('RmiRNA-pseudogene.txt', 1),
        ('RsnoRNA-pseudogene.txt', 1),
        ('Hgene-pseudo.txt', 1),
        ('Hpseudo-anatomy.txt', 0),
        ('Hpseudo-cell.txt', 0),
    ]
]).drop_duplicates()

# Map every collected identifier to ['SO_0000336'] and merge the mapping into nonO_data.
pseudononO_data = pd.DataFrame(pseudononO_data)
pseudononO_data['SO'] = [['SO_0000336']] * len(pseudononO_data)
pseudononO_data = pseudononO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **pseudononO_data['SO']}

***
### Mitochondrial tRNA sequences

In [None]:
# Mitochondrial tRNA identifiers come from column 1 of Hgene-mt_tRNA.txt;
# each distinct identifier is mapped to both SO_0000253 and SO_0001272.
mttRNAnonO_data = pd.DataFrame(
    pd.read_csv(
        '../resources/edge_data/Hgene-mt_tRNA.txt', sep='\t', header=None
    )[1].drop_duplicates()
)
mttRNAnonO_data['SO'] = len(mttRNAnonO_data) * [['SO_0000253', 'SO_0001272']]
# The single source column keeps its label (1), hence the index on column 1.
mttRNAnonO_data = mttRNAnonO_data.set_index(1).to_dict()

# Merge into a fresh copy of the running map; new entries win on key clashes.
nonO_data = dict(nonO_data)
nonO_data.update(mttRNAnonO_data['SO'])

***
### miscRNA sequences

In [None]:
# Collect every miscRNA identifier from the edge files below (the number is
# the column of each header-less TSV that is read) and map each distinct
# identifier to SO_0000356 in the subclass construction map.
# pd.concat replaces the chained Series.append calls, which were removed in
# pandas 2.0; concatenation order is unchanged.
unknownRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('RsnoRNA-miscRNA.txt', 1),
        ('Hgene-misc_RNA.txt', 1),
        ('Hmisc_RNA-anatomy.txt', 0),
        ('Hmisc_RNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Mixed source labels (0 and 1) leave the Series unnamed -> frame column 0.
unknownRNAnonO_data = pd.DataFrame(unknownRNAnonO_data)
unknownRNAnonO_data['SO'] = [['SO_0000356']] * len(unknownRNAnonO_data)
unknownRNAnonO_data = unknownRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **unknownRNAnonO_data['SO']}

***
### mRNA sequences

In [None]:
# Collect every mRNA identifier from the edge files below (the number is the
# column of each header-less TSV that is read) and map each distinct
# identifier to SO_0000234 in the subclass construction map.
# pd.concat replaces the chained Series.append calls, which were removed in
# pandas 2.0; concatenation order is unchanged.
# NOTE(review): RASOd-mRNA.txt is read from column 0 while every other
# '*-mRNA.txt' file here is read from column 1 -- confirm the mRNA really sits
# in the first column of that file.
mRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('RpremiRNA-mRNA.txt', 1),
        ('RmiRNA-mRNA.txt', 1),
        ('RsnoRNA-mRNA.txt', 1),
        ('RASO-mRNA.txt', 1),
        ('RASOd-mRNA.txt', 0),
        ('RsiRNAd-mRNA.txt', 1),
        ('RsiRNA-mRNA.txt', 1),
        ('RshRNA-mRNA.txt', 1),
        ('RlncRNA-mRNA.txt', 1),
        ('Hgene-mRNA.txt', 1),
        ('HmRNA-protein.txt', 0),
        ('HmRNA-anatomy.txt', 0),
        ('HmRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Mixed source labels (0 and 1) leave the Series unnamed -> frame column 0.
mRNAnonO_data = pd.DataFrame(mRNAnonO_data)
mRNAnonO_data['SO'] = [['SO_0000234']] * len(mRNAnonO_data)
mRNAnonO_data = mRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **mRNAnonO_data['SO']}

***
### TEC sequences

In [None]:
# Collect every TEC identifier from the edge files below (the number is the
# column of each header-less TSV that is read) and map each distinct
# identifier to SO_0002139 in the subclass construction map.
# pd.concat replaces the chained Series.append calls, which were removed in
# pandas 2.0; concatenation order is unchanged.
TECnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('Hgene-TEC.txt', 1),
        ('HTEC-anatomy.txt', 0),
        ('HTEC-cell.txt', 0),
    ]
]).drop_duplicates()

# Mixed source labels (0 and 1) leave the Series unnamed -> frame column 0.
TECnonO_data = pd.DataFrame(TECnonO_data)
TECnonO_data['SO'] = [['SO_0002139']] * len(TECnonO_data)
TECnonO_data = TECnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **TECnonO_data['SO']}

***
### tsRNA sequences

In [None]:
# Collect every tsRNA identifier from the first column of the edge files
# below and map each distinct identifier to SO_0000253 in the subclass
# construction map.
# pd.concat replaces the chained Series.append calls, which were removed in
# pandas 2.0; concatenation order is unchanged.
# NOTE(review): SO_0000253 is the same term the tRNA cell of this notebook
# assigns -- confirm tsRNAs are meant to share it.
tsRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('RtsRNA-miRNA.txt', 0),
        ('RtsRNA-disease.txt', 0),
        ('RtsRNA-tRNA_tRFdb.txt', 0),
        ('RtsRNA-tRNA_MINTbase.txt', 0),
    ]
]).drop_duplicates()

# Every input is column 0, so the Series (and the frame column) is labelled 0.
tsRNAnonO_data = pd.DataFrame(tsRNAnonO_data)
tsRNAnonO_data['SO'] = [['SO_0000253']] * len(tsRNAnonO_data)
tsRNAnonO_data = tsRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **tsRNAnonO_data['SO']}

***
### Riboswitch sequences

In [None]:
# Collect every riboswitch identifier from the first column of the edge files
# below and map each distinct identifier to SO_0000035 in the subclass
# construction map.
# pd.concat replaces the chained Series.append calls, which were removed in
# pandas 2.0; concatenation order is unchanged.
riboswitchnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('Rriboswitch-protein.txt', 0),
        ('Rriboswitch-bactStrain.txt', 0),
        ('Rriboswitch-gobp.txt', 0),
    ]
]).drop_duplicates()

# Every input is column 0, so the Series (and the frame column) is labelled 0.
riboswitchnonO_data = pd.DataFrame(riboswitchnonO_data)
riboswitchnonO_data['SO'] = [['SO_0000035']] * len(riboswitchnonO_data)
riboswitchnonO_data = riboswitchnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **riboswitchnonO_data['SO']}

***
### Ribozyme sequences

In [None]:
# Collect every ribozyme identifier from the edge files below (the number is
# the column of each header-less TSV that is read) and map each distinct
# identifier to SO_0000374 in the subclass construction map.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
ribozymenonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('Rribozyme-GO.txt', 0),
        ('RviralRNA-ribozyme.txt', 1),
    ]
]).drop_duplicates()

# Mixed source labels (0 and 1) leave the Series unnamed -> frame column 0.
ribozymenonO_data = pd.DataFrame(ribozymenonO_data)
ribozymenonO_data['SO'] = [['SO_0000374']] * len(ribozymenonO_data)
ribozymenonO_data = ribozymenonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **ribozymenonO_data['SO']}

***
### Viral RNA sequences

In [None]:
# Show the distinct moleculeType values; the cells below filter on them.
vRNA_ribozyme['moleculeType'].unique()

In [None]:
# Rows of vRNA_ribozyme whose moleculeType is exactly 'ssRNA', tagged with
# SO_0001199. The explicit .copy() makes the subset an independent frame, so
# adding the 'SO' column no longer triggers pandas' SettingWithCopyWarning.
ssRNA = vRNA_ribozyme.loc[
    vRNA_ribozyme['moleculeType'] == 'ssRNA', ['accession', 'moleculeType']
].copy()
ssRNA['SO'] = [['SO_0001199']] * len(ssRNA)
ssRNA

In [None]:
# Turn the ssRNA frame into {column: {accession: value}} and fold its 'SO'
# mapping into a fresh copy of the running map (new entries win on clashes).
ssRNAnonO_data = ssRNA.set_index('accession').to_dict()

nonO_data = dict(nonO_data)
nonO_data.update(ssRNAnonO_data['SO'])

In [None]:
# Tag the remaining viral RNA accessions by moleculeType and merge each
# subset into the subclass construction map. Every subset is taken with an
# explicit .copy() so that adding the 'SO' column works on an independent
# frame and does not trigger pandas' SettingWithCopyWarning.

# moleculeType == 'ssRNA(-)' -> SO_0001200
ssRNAm = vRNA_ribozyme.loc[
    vRNA_ribozyme['moleculeType'] == 'ssRNA(-)', ['accession', 'moleculeType']
].copy()
ssRNAm['SO'] = [['SO_0001200']] * len(ssRNAm)
ssRNAmnonO_data = ssRNAm.set_index('accession').to_dict()
nonO_data = {**nonO_data, **ssRNAmnonO_data['SO']}

# moleculeType == 'RNA' -> SO_0001169
# NOTE(review): the variable is named dsRNA but the filter matches the
# generic value 'RNA' -- confirm this mapping is intended.
dsRNA = vRNA_ribozyme.loc[
    vRNA_ribozyme['moleculeType'] == 'RNA', ['accession', 'moleculeType']
].copy()
dsRNA['SO'] = [['SO_0001169']] * len(dsRNA)
dsRNAnonO_data = dsRNA.set_index('accession').to_dict()
nonO_data = {**nonO_data, **dsRNAnonO_data['SO']}

# Missing moleculeType -> SO_0001041
viralRNA = vRNA_ribozyme.loc[
    vRNA_ribozyme['moleculeType'].isna(), ['accession', 'moleculeType']
].copy()
viralRNA['SO'] = [['SO_0001041']] * len(viralRNA)
viralRNAnonO_data = viralRNA.set_index('accession').to_dict()
nonO_data = {**nonO_data, **viralRNAnonO_data['SO']}

***
### siRNA sequences

In [None]:
# siRNA identifiers come from the first column of RsiRNA-mRNA.txt; each
# distinct identifier is mapped to SO_0000646.
siRNAnonO_data = pd.DataFrame(
    pd.read_csv(
        '../resources/edge_data/RsiRNA-mRNA.txt', sep='\t', header=None
    )[0].drop_duplicates()
)
siRNAnonO_data['SO'] = len(siRNAnonO_data) * [['SO_0000646']]
siRNAnonO_data = siRNAnonO_data.set_index(0).to_dict()

# Merge into a fresh copy of the running map; new entries win on key clashes.
nonO_data = dict(nonO_data)
nonO_data.update(siRNAnonO_data['SO'])

***
### shRNA sequences

In [None]:
# shRNA identifiers come from the first column of RshRNA-mRNA.txt; each
# distinct identifier is mapped to SO_0002031.
shRNAnonO_data = pd.DataFrame(
    pd.read_csv(
        '../resources/edge_data/RshRNA-mRNA.txt', sep='\t', header=None
    )[0].drop_duplicates()
)
shRNAnonO_data['SO'] = len(shRNAnonO_data) * [['SO_0002031']]
shRNAnonO_data = shRNAnonO_data.set_index(0).to_dict()

# Merge into a fresh copy of the running map; new entries win on key clashes.
nonO_data = dict(nonO_data)
nonO_data.update(shRNAnonO_data['SO'])

***
### snoRNA sequences

In [None]:
# Collect every snoRNA identifier from the edge files below (the number is
# the column of each header-less TSV that is read) and map each distinct
# identifier to SO_0000275 in the subclass construction map.
# RsnoRNA-snoRNA.txt contributes both of its columns.
# pd.concat replaces the chained Series.append calls, which were removed in
# pandas 2.0; concatenation order is unchanged.
snoRNAnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('RsnoRNA-gene.txt', 0),
        ('RsnoRNA-premiRNA.txt', 0),
        ('RsnoRNA-miRNA.txt', 0),
        ('RsnoRNA-snoRNA.txt', 0),
        ('RsnoRNA-snoRNA.txt', 1),
        ('RsnoRNA-lncRNA.txt', 0),
        ('RsnoRNA-snRNA.txt', 0),
        ('RsnoRNA-rRNA.txt', 0),
        ('RsnoRNA-mRNA.txt', 0),
        ('RsnoRNA-tRNA.txt', 0),
        ('RsnoRNA-retainedIntron.txt', 0),
        ('RsnoRNA-miscRNA.txt', 0),
        ('RsnoRNA-scaRNA.txt', 0),
        ('RsnoRNA-pseudogene.txt', 0),
        ('Hgene-snoRNA.txt', 1),
        ('HsnoRNA-anatomy.txt', 0),
        ('HsnoRNA-cell.txt', 0),
    ]
]).drop_duplicates()

# Mixed source labels (0 and 1) leave the Series unnamed -> frame column 0.
snoRNAnonO_data = pd.DataFrame(snoRNAnonO_data)
snoRNAnonO_data['SO'] = [['SO_0000275']] * len(snoRNAnonO_data)
snoRNAnonO_data = snoRNAnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **snoRNAnonO_data['SO']}

***
### Small proteins

In [None]:
# Small-protein identifiers come from the first column of
# RsmallProtein-lncRNA.txt; each distinct identifier is mapped to SO_0000104.
spnonO_data = pd.DataFrame(
    pd.read_csv(
        '../resources/edge_data/RsmallProtein-lncRNA.txt', sep='\t', header=None
    )[0].drop_duplicates()
)
spnonO_data['SO'] = len(spnonO_data) * [['SO_0000104']]
spnonO_data = spnonO_data.set_index(0).to_dict()

# Merge into a fresh copy of the running map; new entries win on key clashes.
nonO_data = dict(nonO_data)
nonO_data.update(spnonO_data['SO'])

***
### siRNA drugs

In [None]:
# Collect every siRNA-drug identifier from the first column of the edge files
# below and map each distinct identifier to both SO_0002031 and CHEBI_23888.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
# NOTE(review): SO_0002031 is the term the shRNA cell of this notebook uses,
# whereas the siRNA cell uses SO_0000646 -- confirm which one siRNA drugs
# should carry.
siRNAdnonO_data = pd.concat([
    pd.read_csv('../resources/edge_data/' + fname, sep='\t', header=None)[col]
    for fname, col in [
        ('RsiRNAd-mRNA.txt', 0),
        ('RsiRNAd-disease.txt', 0),
    ]
]).drop_duplicates()

# Every input is column 0, so the Series (and the frame column) is labelled 0.
siRNAdnonO_data = pd.DataFrame(siRNAdnonO_data)
siRNAdnonO_data['SO'] = [['SO_0002031', 'CHEBI_23888']] * len(siRNAdnonO_data)
siRNAdnonO_data = siRNAdnonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **siRNAdnonO_data['SO']}

***
### Biological roles in ChEBI

In [None]:
# Lookup table: each of the three role labels is paired with the one-element
# list ['CHEBI_24432'].
bio_role = pd.DataFrame({
    'role': ['General', 'Tumor-Suppressor-Gene', 'Oncogene'],
    'ChEBI': 3 * [['CHEBI_24432']],
})
bio_role

In [None]:
# {'ChEBI': {role: [...]}}; its inner mapping is folded into a fresh copy of
# the running map (new entries win on key clashes).
role_nonO_data = bio_role.set_index('role').to_dict()

nonO_data = dict(nonO_data)
nonO_data.update(role_nonO_data['ChEBI'])

***
### Epigenetic modifications in GO

In [None]:
#miRNA_epiMod.epi_modification.unique()

In [None]:
# Histone modification labels that are all paired with ['GO_0016571'].
epiMod = pd.DataFrame({
    'mod': [
        'H3K4me3', 'H3K9me2', 'H3K9me3', 'H3K27me3', 'H3K4me', 'H3K79me2',
        'H3K4me2', 'H3K9me', 'H3K27me', 'H3K36me2', 'H3R17me2',
    ],
})
epiMod['GO'] = len(epiMod) * [['GO_0016571']]
epiMod

In [None]:
# {'GO': {mod: [...]}}; its inner mapping is folded into a fresh copy of the
# running map (new entries win on key clashes).
go_nonO_data = epiMod.set_index('mod').to_dict()

nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Single-row table pairing 'H3S10P' with ['GO_0006468'].
epiMod = pd.DataFrame({
    'mod': ['H3S10P'],
    'GO': [['GO_0006468']],
})
epiMod

In [None]:
# {'GO': {mod: [...]}}; its inner mapping is folded into a fresh copy of the
# running map (new entries win on key clashes).
go_nonO_data = epiMod.set_index('mod').to_dict()

nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Histone modification labels that are all paired with ['GO_0016573'].
epiMod = pd.DataFrame({
    'mod': ['H3ac', 'H4ac', 'H3K9ac', 'H5ac', 'H3K4ac', 'H3K14ac'],
})
epiMod['GO'] = len(epiMod) * [['GO_0016573']]
epiMod

In [None]:
# {'GO': {mod: [...]}}; its inner mapping is folded into a fresh copy of the
# running map (new entries win on key clashes).
go_nonO_data = epiMod.set_index('mod').to_dict()

nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Persist the accumulated identifier -> ontology-term map as a pickle, using
# the highest pickle protocol available.
with open(
    '../resources/construction_approach/subclass_construction_map.pkl',
    mode='wb',
) as handle:
    pickle.dump(nonO_data, file=handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Optional sanity check -- not part of the build: reload the pickled map
# written above and display its entries. Note that this rebinds nonO_data.
nonO_data = pd.read_pickle(
    '../resources/construction_approach/subclass_construction_map.pkl'
)

nonO_data.items()