# <p style="text-align: center;">RNA Knowledge Graph Build Data Preparation</p>
    
***
***

**Author:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@unimi.it)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/)
  
<br>  
  
**Purpose:** This notebook serves as a script to download, process, map, and clean data in order to build edges for the RNA-centered Knowledge Graph.

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts.  
- **Data**: All downloaded and generated data sources are provided through the dedicated Zenodo repository [10.5281/zenodo.10078876](https://zenodo.org/doi/10.5281/zenodo.10078876). <u>This notebook will download everything you need</u>.  
_____
***

## Set-Up Environment
_____

In [1]:
%%capture
# Install the packages listed in requirements.txt using the interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt
# Add the parent directory to the module search path
# (presumably so the project-local packages imported in the next cell resolve — verify).
sys.path.append('../')

In [2]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import gffpandas.gffpandas as gffpd
import numpy as np
pd.set_option('display.max_columns', None)
import re
import zipfile
import io
from bs4 import BeautifulSoup
from itertools import permutations

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from Bio import SeqIO, Entrez

from Bio.SeqIO.FastaIO import SimpleFastaParser

from typing import Tuple

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


#### Define Global Variables

In [3]:
# Root directory that holds every resource used by the build.
resource_data_location = '../resources/'

# Processed data sits directly under the resource root.
processed_data_location = resource_data_location + 'processed_data/'

# Unprocessed downloads are nested inside the processed-data directory.
unprocessed_data_location = processed_data_location + 'unprocessed_data/'

# Unprocessed edge tables (the pickles written by the edge cells below).
unprocessed_edge_data_location = unprocessed_data_location + 'edges/'

# Output directories for relations, ontologies and final edge data.
relations_data_location = resource_data_location + 'relations_data/'
ontology_data_location = resource_data_location + 'ontologies/'
edge_data_location = resource_data_location + 'edge_data/'

# URL of the processed data bucket.
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'

# Path to the owltools binary.
owltools_location = '../pkt_kg/libs/owltools'

***
***
## IMPORT MAPPING DATASETS  <a class="anchor" id="create-identifier-maps"></a>
***
***

In [4]:
def _read_processed_map(filename):
    """Read a header-less, tab-separated look-up table from the processed-data directory.

    Columns of the returned DataFrame are labelled 0, 1, ... because the file has no header row.
    """
    return pd.read_csv(processed_data_location + filename, sep='\t', header=None)


# --- identifier maps (gene / protein / transcript) ---
entrez_pro_map = _read_processed_map('ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt')[[0, 1]]
symbol_ensembl_map = _read_processed_map('GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt')[[0, 1]]
mesh_to_chebi = _read_processed_map('MESH_CHEBI_MAP.txt')

# --- disease / phenotype maps, plus their union ---
disgenet_mondo_map = _read_processed_map('DISEASE_MONDO_MAP.txt')
phenot_hpo_map = _read_processed_map('PHENOTYPE_HPO_MAP.txt')
disgenet_mondo_hpo_map = pd.concat([disgenet_mondo_map, phenot_hpo_map]).drop_duplicates()

string_pro = _read_processed_map('STRING_PRO_ONTOLOGY_MAP.txt')[[0, 1]]
hpa_gtex_map = _read_processed_map('HPA_GTEx_TISSUE_CELL_MAP.txt')

# --- description (label/synonym) maps; related tables are also merged into one look-up ---
desc_chebi_map = _read_processed_map('DESC_CHEBI_MAP.txt')
desc_drugbank_map = _read_processed_map('DESC_DRUGBANK_MAP.txt')
desc_chebi_map = pd.concat([desc_chebi_map, desc_drugbank_map]).drop_duplicates()
desc_mondo_map = _read_processed_map('DESC_MONDO_MAP.txt')
desc_hpo_map = _read_processed_map('DESC_HP_MAP.txt')
desc_disPhe_map = pd.concat([desc_mondo_map, desc_hpo_map]).drop_duplicates()
desc_go_map = _read_processed_map('DESC_GO_MAP.txt')
desc_reactome_map = _read_processed_map('DESC_REACTOME_MAP.txt')
desc_wpw_map = _read_processed_map('DESC_WIKIPATHWAYS_MAP.txt')
desc_pw_map = _read_processed_map('DESC_PW_MAP.txt')
mirna_mirbase_map = _read_processed_map('MIRNA_MIRBASE_MAP.txt')
symbol_to_pro = _read_processed_map('GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt')
symbol_entrez_map = _read_processed_map('GENE_SYMBOL_ENTREZ_ID_MAP.txt')
ensembl_entrezTranscript_map = _read_processed_map('ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt')
ensembl_entrezGene_map = _read_processed_map('ENSEMBL_GENE_ENTREZ_GENE_MAP.txt')
desc_uberon_map = _read_processed_map('DESC_EXT_MAP.txt')
desc_clo_map = _read_processed_map('DESC_CLO_MAP.txt')
desc_anatomyCell_map = pd.concat([desc_uberon_map, desc_clo_map]).drop_duplicates()
desc_pro_map = _read_processed_map('DESC_PR_MAP.txt')
desc_pro_map_all = _read_processed_map('DESC_PR_ALL_MAP.txt')
desc_so_map = _read_processed_map('DESC_SO_MAP.txt')

# --- tRNA / tsRNA / other RNA-specific maps ---
tRNA_map = _read_processed_map('GtRNAdb_MAP.txt')
# This file lives in the unprocessed directory and carries its own header row.
tsRNA_map = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")
cancer_mondo_map = _read_processed_map('TCGA_MONDO_MAP.txt')
mintbase_tRNA_map = _read_processed_map('tRNA_MINTbase_RNACENTRAL_MAP.txt')
# Also an unprocessed file with a header row.
tRNA_MINTbase_GtRNAdb_map = pd.read_csv(unprocessed_data_location + 'MINTbase-gtRNAdb_mapping.txt', sep='\t')
aa_chebi_map = _read_processed_map('AminoAcid_ChEBI_MAP.txt')
ribozyme_rfam_map = _read_processed_map('ribozyme_RFAM_MAP.txt')
circbase_map = _read_processed_map('CIRCBASE_MAP.txt')
# Only the gene symbol / circRNA ID pair is kept from the circBase table.
symbol_to_circbase = pd.read_csv(unprocessed_data_location + 'hsa_hg19_circRNA.txt', sep='\t')[
    ['gene symbol', 'circRNA ID']].drop_duplicates()
unipro_pro_map = _read_processed_map('UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt')[[0, 1]].drop_duplicates()

# Column 1 can hold several ', '-separated values; give each value its own row.
doid_mondo_map = _read_processed_map('DOID_MONDO_MAP.txt')
doid_mondo_map[1] = doid_mondo_map[1].str.split(', ')
doid_mondo_map = doid_mondo_map.explode(1)

In [5]:
# Load the RNAcentral cross-reference tables. Each per-database table below is read with explicit
# column names, restricted to rows whose 'Organism' equals 9606, and stripped of the columns that
# are not needed for identifier mapping.
#rnacentral_map = pd.read_csv(unprocessed_data_location + "id_mapping.tsv", delimiter='\t',
#                             names=['ID', 'DB', 'DB ID', 'Organism', 'RNA category', 'Label'])
rnacentral_map_human = pd.read_csv(processed_data_location + 'RNACENTRAL_MAP.txt', sep='\t')

# miRBase <-> RNAcentral
rnacentral_map_mirbase = pd.read_csv(
    processed_data_location + 'RNAcentral_MAP/mirbase.tsv',
    sep='\t', names=['RNAcentral ID', 'DB', 'miRBase ID', 'Organism', 'miRNA category', 'Label'])
rnacentral_map_human_mirbase = rnacentral_map_mirbase[rnacentral_map_mirbase['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'miRNA category', 'Label'])

ensembl_map = pd.read_csv(processed_data_location + 'ensembl_identifier_data_cleaned.txt', sep='\t')

# Ensembl <-> RNAcentral
rnacentral_map_ensembl = pd.read_csv(
    processed_data_location + 'RNAcentral_MAP/ensembl.tsv',
    sep='\t', names=['RNAcentral ID', 'DB', 'Ensembl transcript ID', 'Organism', 'RNA category', 'Ensembl Gene ID'])
rnacentral_map_human_ensembl = rnacentral_map_ensembl[rnacentral_map_ensembl['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])
# Keep only the part of the gene identifier that precedes the first '.'
rnacentral_map_human_ensembl['Ensembl Gene ID'] = rnacentral_map_human_ensembl['Ensembl Gene ID'].str.split('.').str[0]

# LNCipedia <-> RNAcentral
rnacentral_map_lncipedia = pd.read_csv(processed_data_location + "RNAcentral_MAP/lncipedia.tsv",sep='\t',
                                       names=['RNAcentral ID', 'DB', 'LNCipedia transcript ID', 'Organism', 'RNA category', 'LNCipedia Gene ID'])
rnacentral_map_human_lncipedia = rnacentral_map_lncipedia[rnacentral_map_lncipedia['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])

# GtRNAdb <-> RNAcentral
rnacentral_map_gtrnadb = pd.read_csv(processed_data_location + "RNAcentral_MAP/gtrnadb.tsv",sep='\t',
                                     names=['RNAcentral ID', 'DB', 'GtRNAdb transcript ID', 'Organism', 'RNA category', 'GtRNAdb Gene ID'])
rnacentral_map_human_gtrnadb = rnacentral_map_gtrnadb[rnacentral_map_gtrnadb['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])

# HGNC <-> RNAcentral (note: 'RNA category' is kept for this table, unlike the others)
rnacentral_map_hgnc = pd.read_csv(processed_data_location + "RNAcentral_MAP/hgnc.tsv",sep='\t',
                                  names=['RNAcentral ID', 'DB', 'HGNC ID', 'Organism', 'RNA category', 'HGNC symbol'])
rnacentral_map_human_hgnc = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] == 9606].drop(
    columns=['Organism', 'DB'])

# LncBook <-> RNAcentral
rnacentral_map_lncbook = pd.read_csv(processed_data_location+"RNAcentral_MAP/lncbook.tsv",sep='\t',
                                     names=['RNAcentral ID', 'DB', 'LncBook Transcript ID', 'Organism', 'RNA category', 'LncBook Gene ID'])
rnacentral_map_human_lncbook = rnacentral_map_lncbook[rnacentral_map_lncbook['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])

# NONCODE <-> RNAcentral
rnacentral_map_noncode = pd.read_csv(processed_data_location + "RNAcentral_MAP/noncode.tsv",sep='\t',
                                     names=['RNAcentral ID', 'DB', 'NONCODE Transcript ID', 'Organism', 'RNA category', 'NONCODE Gene ID'])
rnacentral_map_human_noncode = rnacentral_map_noncode[rnacentral_map_noncode['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])

# Rfam <-> RNAcentral; the sixth column is read under the placeholder name "nan" and dropped at once
rnacentral_map_rfam = pd.read_csv(processed_data_location + "RNAcentral_MAP/rfam.tsv",sep='\t',
                                     names=['RNAcentral ID', 'DB', 'Rfam ID', 'Organism', 'RNA category', "nan"]).drop(columns="nan")
rnacentral_map_human_rfam = rnacentral_map_rfam[rnacentral_map_rfam['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])

# RefSeq <-> RNAcentral, extended with the Ensembl RefSeq cross-references fetched over the network
rnacentral_map_refseq = pd.read_csv(processed_data_location + "RNAcentral_MAP/refseq.tsv",sep='\t', 
                                    names=['RNAcentral ID', 'DB', 'RefSeq ID', 'Organism', 'RNA category', "Label"])
ensembl_refseq_map = pd.read_csv('https://ftp.ensembl.org/pub/release-113/tsv/homo_sapiens/Homo_sapiens.GRCh38.113.refseq.tsv.gz', sep='\t')
rnacentral_map_human_refseq = rnacentral_map_refseq[rnacentral_map_refseq['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])
# Keep only the Ensembl xrefs that the RNAcentral table does not already cover
ensembl_refseq_map = ensembl_refseq_map[~ensembl_refseq_map['xref'].isin(rnacentral_map_human_refseq['RefSeq ID'])]
# NOTE(review): for these extra rows the 'RNAcentral ID' column holds an Ensembl transcript stable ID,
# not an RNAcentral accession — presumably an intended fallback; confirm with downstream users.
rnacentral_map_human_refseq = pd.concat(
    [rnacentral_map_human_refseq, ensembl_refseq_map[['transcript_stable_id','xref']].rename(
        columns={'transcript_stable_id':'RNAcentral ID', 'xref':'RefSeq ID'})]).drop_duplicates(subset='RefSeq ID', keep='first')

# piRBase <-> RNAcentral (two-column file)
rnacentral_map_human_pirbase = pd.read_csv(processed_data_location + "RNAcentral_MAP/pirbase.tsv",sep='\t',names=['RNAcentral ID', "piRBase ID"])

In [6]:
# Look-up tables to ground properties: synonyms are mapped to labels.
# The unprocessed description tables are outer-joined with the processed ones on the identifier
# column (1); '0_x' comes from the unprocessed side and '0_y' from the processed side.
location_map = (
    pd.concat([pd.read_csv(unprocessed_data_location + name, sep='\t', header=None)
               for name in ('DESC_EXT_MAP.txt', 'DESC_CLO_MAP.txt', 'DESC_MONDO_MAP.txt')])
    .merge(pd.concat([desc_anatomyCell_map, desc_mondo_map]), how='outer', on=1)[['0_y', '0_x']]
)
# Rows present on only one side borrow the description from the other side ...
location_map = location_map.assign(**{'0_y': lambda t: t['0_y'].fillna(t['0_x'])})
location_map = location_map.assign(**{'0_x': lambda t: t['0_x'].fillna(t['0_y'])})
# ... and each '0_y' description is kept once (first occurrence wins).
location_map = location_map.drop_duplicates(subset='0_y', keep='first')
disease_map = location_map.copy()

# Method look-up table: unprocessed NCIt descriptions ('0_x') outer-joined with the processed ones
# ('0_y') on the identifier column (1).
method_map = pd.read_csv(unprocessed_data_location + 'DESC_NCIT_MAP.txt', sep='\t', header=None)
desc_method_map = pd.read_csv(processed_data_location + 'DESC_NCIT_MAP.txt', sep='\t', header=None)
method_map = method_map.merge(desc_method_map, how='outer', on=1)[['0_y', '0_x']]
# Fill the gaps BEFORE de-duplicating, as done for location_map: drop_duplicates treats missing
# values as equal, so de-duplicating first would collapse every row lacking a '0_y' value into a
# single row and silently discard the remaining unmatched descriptions.
method_map['0_y'] = method_map['0_y'].fillna(method_map['0_x'])
method_map['0_x'] = method_map['0_x'].fillna(method_map['0_y'])
method_map = method_map.drop_duplicates(subset='0_y', keep='first')

***
***
## DOWNLOAD AND PROCESS EDGE DATASETS<a class="anchor" id="create-edges"></a>
Edges are classified according to the types of their interactors. Relations coming from different sources are matched by joining their dataframes after harmonizing the identifiers.
***
***

### RNA - http://purl.obolibrary.org/obo/RO_0002203 (develops into) - RNA

* [miRBase](https://www.mirbase.org/) <br />  The miRBase database is a searchable database of published miRNA sequences and annotation. Each entry represents a predicted hairpin portion of a miRNA transcript (termed mir in the database), with information on the location and sequence of the mature miRNA sequence (termed miR).

In [None]:
# Fetch the miRBase EMBL-format dump; the next cell reads it as 'miRNA.dat' from unprocessed_data_location.
data_downloader('https://www.mirbase.org/download/miRNA.dat', unprocessed_data_location)

In [None]:
embl_file = unprocessed_data_location + 'miRNA.dat'

# Column-oriented accumulator: one list per output column, one entry per EMBL record
data = {
    "ID": [],
    "Description": [],
    "Sequence": [],
    "Comments": [],
    "References": [],
    "Feature Table": []
}

# Iterate through the records in the EMBL file
for record in SeqIO.parse(embl_file, "embl"):
    data["ID"].append(record.id)
    data["Description"].append(record.description)
    data["Sequence"].append(str(record.seq))
    data["Comments"].append(str(record.annotations.get('comment', '')))
    # Numbered PubMed links, e.g. "[1] https://pubmed.ncbi.nlm.nih.gov/<id>".
    # (The previous f-string formatted a tuple and produced "([1], 'https://...')".)
    references = [
        f"[{i}] https://pubmed.ncbi.nlm.nih.gov/{ref.pubmed_id}"
        for i, ref in enumerate(record.annotations.get('references', []), start=1)
    ]
    data["References"].append(", ".join(references))
    # One text block per record: the string form of every feature, newline-separated
    feature_table = "\n".join(str(feature) for feature in record.features)
    data["Feature Table"].append(feature_table)

df = pd.DataFrame(data)
# Keep human entries only; .copy() so the column assignment below acts on an independent frame
df = df[df['Description'].astype(str).str.contains('Homo sapiens')].copy()

# Split each record's feature text at the "type: miRNA" marker and give every piece its own row
df['Feature Table'] = df['Feature Table'].str.split("type: miRNA")
df = df.explode('Feature Table')
# Discard the empty pieces produced by the split
df = df[df['Feature Table'] != '']
df.head(n=3)

In [None]:
# Turn each feature text block into a list of its lines (consumed item by item by extract_values below).
df['Feature Table'] = df['Feature Table'].str.split("\n")
# Show the entry (or entries) stored under index label 57 as an example.
list(df['Feature Table'].loc[57])

In [None]:
def extract_values(row):
    """Parse the text lines of one feature block into a Series of named values.

    Args:
        row: iterable of strings (the lines of a single feature block).

    Returns:
        pd.Series with a 'location' entry for a line containing "location: " and one entry per
        "Key: <name>, Value: <value>" line; the value is stripped of the surrounding
        list/quote characters. When a name occurs more than once, the last occurrence wins.
    """
    parsed = {}
    for line in row:
        if "location: " in line:
            parsed['location'] = line.split("location: ")[1]
            continue
        if "Key: " not in line:
            continue
        # "<name>, Value: ['<value>']" -> name / raw value
        pieces = line.split("Key: ")[1].split(", Value:")
        name = pieces[0].strip()
        parsed[name] = pieces[1].strip(" ['").strip("'']")
    return pd.Series(parsed)

# Parse every feature block into its own set of qualifier columns
new_columns = df['Feature Table'].apply(extract_values)

# Attach the parsed columns next to the record columns. 'product' is dropped right away:
# per the original note, 'accession' already carries the miRBase identifier for the same entry.
df = pd.concat([df, new_columns], axis=1).drop(columns=['product'])

df = df.assign(Source='miRBase')
df.head(n=1)

The `Description`, `Sequence`, `Comments`, `References`, `Feature Table`, `location`, `evidence` and `experiment` columns are node properties of pre-miRNAs and miRNAs, not edge properties, so we remove them. The `accession` column is kept because it identifies the mature miRNA at the other end of each edge.

In [None]:
# Columns that describe the nodes rather than the edge
node_property_columns = ['Description', 'Sequence', 'Comments', 'References', 'Feature Table',
                         'location', 'evidence', 'experiment']
df = df.drop(columns=node_property_columns)

# One row per (hairpin ID, mature accession) pair; rows missing either value are discarded
premiRNAmiRNA = df.loc[:, ['ID', 'accession', 'Source']].dropna()
premiRNAmiRNA.head(n=3)

In [None]:
# Coverage check: True only if every pre-miRNA ID appears in the human miRBase -> RNAcentral table.
# (This tests presence of the IDs; it does not by itself show that the mapping is 1-to-1.)
all(premiRNAmiRNA['ID'].isin(rnacentral_map_human_mirbase['miRBase ID']))

In [None]:
# Coverage check: True only if every mature miRNA accession appears in the human miRBase -> RNAcentral table.
# (This tests presence of the accessions; it does not by itself show that the mapping is 1-to-1.)
all(premiRNAmiRNA['accession'].isin(rnacentral_map_human_mirbase['miRBase ID']))

In [None]:
# Replace both miRBase identifiers by RNAcentral IDs: the hairpin becomes the start node,
# the mature miRNA the end node. Inner joins drop pairs without a mapping.
premiRNAmiRNA = (
    premiRNAmiRNA
    .merge(rnacentral_map_human_mirbase.rename(columns={'miRBase ID': 'ID'}), on='ID')
    .drop(columns=['ID'])
    .rename(columns={'RNAcentral ID': ':START_ID'})
    .merge(rnacentral_map_human_mirbase.rename(columns={'miRBase ID': 'accession'}), on='accession')
    .drop(columns=['accession'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
    .fillna('nan')
)
premiRNAmiRNA.head(n=3)

* [tsRFun](https://rna.sysu.edu.cn/tsRFun)

In [None]:
# tsRFun table: keep only the parent tRNA label and the tsRNA identifier
tsRNA_map = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")[['tRNA', 'tsRNAid']]

# GtRNAdb -> RNAcentral table, restricted to organism 9606
gtrnadb_rnacentral_map_human = pd.read_csv(
    processed_data_location + "RNAcentral_MAP/gtrnadb.tsv", sep='\t',
    names=['RNAcentral ID', 'DB', 'GtRNAdb ID', 'Organism', 'RNA category', "Label"])
gtrnadb_rnacentral_map_human = gtrnadb_rnacentral_map_human.loc[
    gtrnadb_rnacentral_map_human['Organism'] == 9606]

# Join on the tRNA label: the tRNA (RNAcentral ID) is the start node, the tsRNA the end node
tsRNA_RNAcentral_map = (
    tsRNA_map
    .merge(gtrnadb_rnacentral_map_human, left_on='tRNA', right_on='Label')[['tsRNAid', 'RNAcentral ID']]
    .drop_duplicates()
    .rename(columns={'tsRNAid': ':END_ID', 'RNAcentral ID': ':START_ID'})
)
tsRNA_RNAcentral_map['Source'] = 'tsRFun'
tsRNA_RNAcentral_map.head(n=3)

* [tRFdb](http://genome.bioch.virginia.edu/trfdb/index.php) <br /> tRFdb is a comprehensive database of tRFs prepared from publicly available high-throughput sequencing data of >50 short RNA libraries. tRFs originate precisely from the extreme 5' (tRF-5) or 3' ends (tRF-3) of mature tRNAs or from the 3' trailer sequence of precursor tRNA transcripts (tRF-1) and are present in humans, mice, flies, worms and yeasts.

In [None]:
#http://genome.bioch.virginia.edu/trfdb/index.php --> download webpages as html

In [None]:
# tRF-1 entries: third table of the saved tRFdb page, without the columns not used for edges
tRF1_tRNA = pd.read_html(unprocessed_data_location + 'trf1.html')[2].drop(
    columns=['Type', 'Organism', 'Experiment Info', 'Sequence', 'tRNA Gene Co-ordinates'])
tRF1_tRNA.head(n=3)

In [None]:
# tRF-3 entries: third table of the saved tRFdb page, without the columns not used for edges
tRF3_tRNA = pd.read_html(unprocessed_data_location + 'trf3.html')[2].drop(
    columns=['Type', 'Organism', 'Experiment Info', 'Sequence', 'tRNA Gene Co-ordinates'])
tRF3_tRNA.head(n=3)

In [None]:
# tRF-5 entries: third table of the saved tRFdb page, without the columns not used for edges
tRF5_tRNA = pd.read_html(unprocessed_data_location + 'trf5.html')[2].drop(
    columns=['Type', 'Organism', 'Experiment Info', 'Sequence', 'tRNA Gene Co-ordinates'])
tRF5_tRNA.head(n=3)

In [None]:
# Stack the three tRFdb tables and tag every row with its source database
tRF_tRNA = pd.concat([tRF1_tRNA, tRF3_tRNA, tRF5_tRNA]).assign(Source='tRFdb')
tRF_tRNA.head(n=3)

In [None]:
tRF_tRNA = (
    tRF_tRNA
    .rename(columns={'tRF ID': 'tRF', 'tRNA Name': 'tRNA'})
    # end node: the tRF identifier prefixed with "trfdb?"
    .assign(**{':END_ID': lambda t: "trfdb?" + t['tRF'].astype(str)})
    .drop(columns=['tRF'])
    # map tRNA column using GtRNAdb look-up table
    .merge(tRNA_map.rename(columns={0: 'tRNA'}), on='tRNA')
    .drop(columns=['tRNA'])
    .rename(columns={1: 'tRNA'})
    # start node: the RNAcentral ID of the mapped GtRNAdb gene
    .merge(rnacentral_map_human_gtrnadb[['RNAcentral ID', 'GtRNAdb Gene ID']].drop_duplicates().rename(
        columns={'GtRNAdb Gene ID': 'tRNA'}), on='tRNA')
    .drop(columns=['tRNA'])
    .rename(columns={'RNAcentral ID': ':START_ID'})
)
tRF_tRNA.head(n=3)

* [MINTbase](https://cm.jefferson.edu/MINTbase/) <br /> The Mitochondrial and Nuclear tRNA fragment database (MINTbase) is a repository of tRNA fragments (tRFs).

In [None]:
# https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop --> copy-paste table content

In [None]:
tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbase.txt',sep='\t')
# Keep the part of the alternative ID that precedes the first '@' (used below as the tRNA name)
tRF_tRNA2['MINTbase Alternative IDs (GRCh37 assembly-derived)'] = tRF_tRNA2[
    'MINTbase Alternative IDs (GRCh37 assembly-derived)'].str.split('@').str[0]
tRF_tRNA2 = tRF_tRNA2.rename(columns={'MINTbase Alternative IDs (GRCh37 assembly-derived)':'MINTbase tRNA name'})
tRF_tRNA2 = tRF_tRNA2[['License Plate (sequence derived)','MINTbase tRNA name','Expressed (# of datasets)?','Maximum RPM']].drop_duplicates()
# Replace the MINTbase tRNA name by the identifier from the look-up table; it becomes the start node
tRF_tRNA2 = pd.merge(tRF_tRNA2, mintbase_tRNA_map.rename(columns={0:'MINTbase tRNA name'}), on='MINTbase tRNA name').drop(
    columns=['MINTbase tRNA name']).rename(columns={1:':START_ID'})

# "yes (N)" -> N. The patterns are raw strings: "\(" and "\)" are not valid escape sequences in
# ordinary string literals and trigger a warning on current Python versions.
tRF_tRNA2['Expressed (# of datasets)?'] = tRF_tRNA2['Expressed (# of datasets)?'].str.replace(r"yes \(", "",regex=True)
tRF_tRNA2['Expressed (# of datasets)?'] = tRF_tRNA2['Expressed (# of datasets)?'].str.replace(r"\)", "",regex=True).astype(int)
# The license plate (sequence-derived tRF identifier) is the end node
tRF_tRNA2 = tRF_tRNA2.rename(columns={'License Plate (sequence derived)':':END_ID', 'Maximum RPM':'Maximum_RPM'})
tRF_tRNA2['Source'] = 'MINTbase'
tRF_tRNA2.head(n=3)

In [None]:
# Stack the edges from miRBase, tsRFun, tRFdb and MINTbase
RNA_develop_into_RNA = pd.concat([premiRNAmiRNA, tsRNA_RNAcentral_map, tRF_tRNA, tRF_tRNA2])
# One row per (start, end) pair: average the RPM values and collect the contributing sources.
# The string alias 'mean' is used instead of np.mean, which pandas deprecates as an agg argument.
RNA_develop_into_RNA = RNA_develop_into_RNA.groupby([':START_ID',':END_ID']).agg(
    {'Maximum_RPM': 'mean', 'Source': set}).reset_index()
RNA_develop_into_RNA[':TYPE'] = 'develops_into'
RNA_develop_into_RNA.to_pickle(unprocessed_edge_data_location+'RNA_develops_into_RNA.pkl')
RNA_develop_into_RNA.head(n=3)

In [None]:
# Inverse relation: swap the start/end labels of the develops_into edges and relabel the type
RNA_develop_from_RNA = (
    RNA_develop_into_RNA
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'develops_from'})
)
RNA_develop_from_RNA.to_pickle(unprocessed_edge_data_location+'RNA_develops_from_RNA.pkl')
RNA_develop_from_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_HOM0000000 (in similarity relationship with) - RNA

* [Rfam](http://rfamlive.xfam.org/) <br /> The Rfam database is a collection of RNA families, each represented by multiple sequence alignments, consensus secondary structures and covariance models.

In [None]:
# Download the RNAcentral Rfam annotation dump. The output path is written out literally here;
# it is the same location the next cell reads via unprocessed_data_location.
! wget https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/rfam/rfam_annotations.tsv.gz -O ../resources/processed_data/unprocessed_data/rfam_annotations.tsv.gz

In [None]:
# Rfam cross-references restricted to organism 9606 (all columns kept)
rnacentral_map_rfam_human = rnacentral_map_rfam.loc[rnacentral_map_rfam['Organism'] == 9606]

# Read the annotation dump, keep only (RNAcentral ID, Rfam ID, Score) and restrict it to the
# RNAcentral IDs listed in the table above
rfam_annotations = (
    pd.read_csv(
        unprocessed_data_location + 'rfam_annotations.tsv.gz', sep='\t',
        names=['RNAcentral ID', "Rfam ID", "Score", "E-value", "Sequence-Start", "Sequence-Stop",
               "Model-Start", "Model-Stop", 'Rfam-Model-Description'])
    .drop(columns=["E-value", "Sequence-Start", "Sequence-Stop", "Model-Start", "Model-Stop",
                   'Rfam-Model-Description'])
    .loc[lambda t: t['RNAcentral ID'].isin(rnacentral_map_rfam_human['RNAcentral ID'])]
)
rfam_annotations.head(n=3)

In [None]:
# Pair up all sequences annotated with the same Rfam family (self-join on the family ID) ...
edges = pd.merge(rfam_annotations,rfam_annotations,on='Rfam ID',suffixes=('_1', '_2'))
# ... and discard the pairs that link a sequence to itself
edges = edges[edges['RNAcentral ID_1'] != edges['RNAcentral ID_2']]
# .copy() so the new column below is added to an independent frame rather than to a slice
edges = edges[['RNAcentral ID_1', 'RNAcentral ID_2', 'Score_1', 'Score_2']].copy()
# Edge score: average of the two sequences' scores against the shared family
edges['Score'] = (edges['Score_1'] + edges['Score_2']) / 2
# 'Score' is left numeric: filling gaps with the string 'nan' would turn the column into mixed
# text/numbers and break the mean aggregation applied to it when the edges are finalized.
rfam_sim = edges[['RNAcentral ID_1','RNAcentral ID_2','Score']].rename(
    columns={'RNAcentral ID_1':':START_ID','RNAcentral ID_2':':END_ID'})
#rfam_sim = rfam_sim.groupby([':START_ID',':END_ID']).agg({'Score': np.mean}).reset_index()
rfam_sim['Source'] = 'Rfam, RNAcentral'
rfam_sim.head(n=3)

* [miRBase](https://www.mirbase.org/)

In [None]:
# Hairpin ID paired with the accession given in its 'similarity' qualifier (rows lacking one are dropped)
premiRNApremiRNA = df.loc[:, ['ID', 'similarity', 'Source']].dropna()
premiRNApremiRNA.head(n=3)

In [None]:
# Coverage check: True only if every 'similarity' accession appears in the human miRBase -> RNAcentral
# table (the original note reports that this mapping is not complete).
all(premiRNApremiRNA['similarity'].isin(rnacentral_map_human_mirbase['miRBase ID']))

In [None]:
# True when none of the 'similarity' accessions is present in the human miRBase -> RNAcentral table.
# (The former comparison measured the length of the boolean mask returned by isin, which always
# equals the number of rows, so it evaluated to True regardless of the data.)
not premiRNApremiRNA['similarity'].isin(rnacentral_map_human_mirbase['miRBase ID']).any()

The miRNAs listed in the `similarity` column all belong to *Mus musculus*, so no human similarity edges can be derived from miRBase.

In [None]:
# Add the reverse direction of every pair so the relation is stored symmetrically
rfam_sim_inverse = rfam_sim.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
rfam_sim = pd.concat([rfam_sim,rfam_sim_inverse])
# 'Rfam, RNAcentral' -> one row per individual source, so they can be collected into a set below
rfam_sim['Source'] = rfam_sim['Source'].str.split(", ")
rfam_sim = rfam_sim.explode('Source')

# One row per (start, end) pair. The string alias 'mean' replaces np.mean, which pandas
# deprecates as an agg argument.
rfam_sim = rfam_sim.groupby([':START_ID',':END_ID']).agg({'Score': 'mean', 'Source': set}).reset_index()
rfam_sim = rfam_sim.rename(columns={'Score':'Rfam_score'})
# The relation type is assigned AFTER the aggregation: agg keeps only the listed columns, so a
# ':TYPE' column set beforehand would be missing from the saved table.
rfam_sim[':TYPE'] = 'in_similarity_relationship_with'
rfam_sim.to_pickle(unprocessed_edge_data_location+'RNA_in_similarity_relationship_with_RNA.pkl')
rfam_sim.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002479 (has part that occurs in) - OBO


* [miRBase](https://www.mirbase.org/)

In [None]:
# List the distinct 'mod_base' qualifier values; the next cell replaces them with a single GO term.
df.mod_base.unique() # GO

In [None]:
# Keep the hairpins that carry a 'mod_base' qualifier and point all of them at the same GO term
df = df.loc[:, ['ID', 'mod_base', 'Source']].dropna().assign(mod_base='GO_0006382')
df

In [None]:
# Coverage check: True only if every remaining pre-miRNA ID appears in the human miRBase -> RNAcentral
# table. (This tests presence of the IDs; it does not by itself show that the mapping is 1-to-1.)
all(df['ID'].isin(rnacentral_map_human_mirbase['miRBase ID']))

In [None]:
# Swap the miRBase hairpin ID for its RNAcentral ID (start node); the GO term is the end node
df = (
    df
    .merge(rnacentral_map_human_mirbase.rename(columns={'miRBase ID': 'ID'}), on='ID')
    .drop(columns=['ID'])
    .rename(columns={'RNAcentral ID': ':START_ID', 'mod_base': ':END_ID'})
)
df

In [None]:
# One row per (start, end) pair with the set of contributing sources, tagged with the relation type
df = (
    df.groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'has_part_that_occurs_in'})
)
df.to_pickle(unprocessed_edge_data_location+'RNA_has_part_that_occurs_in_OBO.pkl')
df.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002435 (genetically interacts with) - RNA

* [piRBase](http://bigdata.ibp.ac.cn/piRBase/)

In [None]:
# Function to retrieve the total number of piRNAs in a cluster
def get_total_pirnas(url):
    """Return the piRNA count shown on a piRBase cluster page.

    Args:
        url: address of the cluster page.

    Returns:
        The integer found in the first <td> after the <th> whose text is "Number of piRNAs".
        0 when the response status is not 200, or when that header or its value cell is missing.

    Raises:
        ValueError: if the value cell does not contain an integer.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return 0
    soup = BeautifulSoup(response.text, "html.parser")
    # `string=` is the current name of the Beautiful Soup argument formerly called `text=`
    piRNA_info = soup.find("th", string="Number of piRNAs")
    if not piRNA_info:
        return 0
    value_cell = piRNA_info.find_next("td")
    if value_cell is None:
        # header present but no following cell: treat like a missing header
        return 0
    return int(value_cell.text.strip())

# Function to fetch and parse a specific page
def fetch_page(url, page_size):
    """POST a page-size request to a cluster page and return its piRNA table.

    Args:
        url: address of the cluster page.
        page_size: value sent as the "pageSize" form field.

    Returns:
        The fourth HTML table of the response when more than three tables are found, otherwise the
        third one, with the 'Name' column stripped of surrounding whitespace. None when the status
        is not 200, when parsing raises ValueError, or when fewer than three tables are present.
    """
    response = requests.post(url, data={"pageSize": page_size})
    if response.status_code != 200:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None
    try:
        tables = pd.read_html(response.text)
        if len(tables) <= 2:
            return None
        table = tables[3] if len(tables) > 3 else tables[2]
        table['Name'] = table['Name'].str.strip()
        return table
    except ValueError as e:
        print(f"Error parsing tables: {e}")
        return None

# Scrape the human piRBase cluster pages clus-hsa-1 ... clus-hsa-230
all_pirnas = []
for i in range(1, 231):
    url = "http://bigdata.ibp.ac.cn/piRBase/clusters.php?organism=hsa&name=clus-hsa-" + str(i)
    # Ask for the cluster size once and reuse it: calling get_total_pirnas twice per cluster
    # doubled the number of HTTP requests.
    total_pirnas = get_total_pirnas(url)
    print(i)
    print(total_pirnas)
    # Request a single page large enough to hold every piRNA of the cluster
    combined_data = fetch_page(url=url,page_size=total_pirnas)
    all_pirnas.append(combined_data)

# Cache the scraped tables on disk and reload them from there
with open(unprocessed_data_location + "piRBase_cluster.pkl", "wb") as f:
    pickle.dump(all_pirnas, f)
with open(unprocessed_data_location + "piRBase_cluster.pkl", "rb") as f:
    all_pirnas = pickle.load(f)
all_pirnas[0].head(n=3)

In [None]:
# Map the piRNA names of every scraped cluster table to RNAcentral IDs
all_new = []
for cluster_table in all_pirnas:
    # fetch_page returns None when a page could not be fetched or parsed
    if cluster_table is None:
        continue
    # Merge the table of THIS cluster (the loop previously merged `combined_data`, i.e. the
    # last page fetched by the scraping cell, on every iteration)
    df_new = pd.merge(cluster_table, rnacentral_map_human_pirbase.rename(columns={'piRBase ID':'Name'}), on='Name').drop(
    columns=['Name','Length','Organism','Sequence']).rename(columns={'RNAcentral ID':'RNA'})
    all_new.append(df_new)
all_new[0].head(n=3)

All dataframes are empty. We keep the processing code below in case gold-standard piRNA sequences are clustered together in a future release.

In [None]:
# For every cluster, build all ordered pairs of its mapped piRNAs
all_pirnas = [
    pd.DataFrame(list(permutations(cluster['RNA'], 2)), columns=["RNA1", "RNA2"])
    for cluster in all_new
]
# Stack the per-cluster pair tables, drop repeated pairs and tag the source
df = pd.concat(all_pirnas).drop_duplicates()
df['Source'] = 'piRBase'
df.head(n=3)

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# Download the TAM miRNA set file. The output path is written out literally here; it is the same
# location the next cell reads via unprocessed_data_location.
! wget http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt -O ../resources/processed_data/unprocessed_data/mirset_v9.txt

In [None]:
with open(unprocessed_data_location+'mirset_v9.txt', 'r') as file:
    data = file.read().rstrip()

# The file is ragged (one set per line, a variable number of tab-separated members), so it is
# split by hand; missing trailing cells become empty strings.
TAM = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

# Rows whose first column mentions "luster" (cluster sets). .copy() so the column assignments
# below act on an independent frame instead of a filtered slice of TAM.
miRNA_miRNA=TAM[(TAM[0].str.contains("luster"))].copy()
miRNA_miRNA[1] = miRNA_miRNA[1].str.lower()
miRNA_miRNA=miRNA_miRNA.dropna(axis=1, how='all')
miRNA_miRNA=miRNA_miRNA.drop(columns=[0,1])
# Column 2 holds the first member; join the remaining members into one comma-separated string
miRNA_miRNA['merged'] = miRNA_miRNA[miRNA_miRNA.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
miRNA_miRNA=miRNA_miRNA[[2,'merged']]
# One row per (first member, other member) pair; empty padding cells are discarded
miRNA_miRNA['merged'] = miRNA_miRNA.merged.str.split(',')
miRNA_miRNA = miRNA_miRNA.explode('merged')
miRNA_miRNA = miRNA_miRNA[miRNA_miRNA.merged != '']
miRNA_miRNA.head(n=3)

In [None]:
# Coverage check: True only if every first-member label (column 2) appears in
# rnacentral_map_human['DB Description']. (Presence only; this does not prove a 1-to-1 mapping.)
all(miRNA_miRNA[2].isin(rnacentral_map_human['DB Description']))

In [None]:
# List the RNA categories of the matched first-member labels
# (the original note reports that they are all pre-miRNAs).
rnacentral_map_human[rnacentral_map_human['DB Description'].isin(miRNA_miRNA[2])]['RNA category'].unique()

In [None]:
# Coverage check: True only if every other-member label ('merged') appears in
# rnacentral_map_human['DB Description']. (Presence only; this does not prove a 1-to-1 mapping.)
all(miRNA_miRNA['merged'].isin(rnacentral_map_human['DB Description']))

In [None]:
# List the RNA categories of the matched other-member labels
# (the original note reports that they are all pre-miRNAs).
rnacentral_map_human[rnacentral_map_human['DB Description'].isin(miRNA_miRNA['merged'])]['RNA category'].unique()

In [None]:
# Translate both labels of every pair into RNAcentral IDs (inner joins drop unmapped labels)
miRNA_miRNA = (
    miRNA_miRNA
    .merge(rnacentral_map_human.rename(columns={'DB Description': 2}), on=2)
    .drop(columns=[2])
    .rename(columns={'RNAcentral ID': 'RNA1'})
    .merge(rnacentral_map_human.rename(columns={'DB Description': 'merged'}), on='merged')
    .drop(columns=['merged'])
    .rename(columns={'RNAcentral ID': 'RNA2'})
    [['RNA1', 'RNA2']]
)

# Order the two IDs within each row so that a pair has a single canonical orientation
miRNA_miRNA = miRNA_miRNA.apply(
    lambda x: sorted(x), axis=1, result_type='expand'
).rename(columns={0: ':START_ID', 1: ':END_ID'})
#miRNA_miRNA['Number_of_experiments'] = (miRNA_miRNA.groupby(miRNA_miRNA.columns.tolist()).transform('size'))
#miRNA_miRNA = miRNA_miRNA[miRNA_miRNA['Number_of_experiments'].notna()].drop_duplicates()

miRNA_miRNA = miRNA_miRNA.assign(Source='TAM')
miRNA_miRNA.head(n=3)

In [None]:
# Add the reverse direction of every pair so the relation is stored symmetrically
miRNA_miRNA_inv = miRNA_miRNA.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
miRNA_miRNA = pd.concat([miRNA_miRNA,miRNA_miRNA_inv])
# One row per (start, end) pair with the set of contributing sources
miRNA_miRNA = miRNA_miRNA.groupby([':START_ID',':END_ID']).agg({'Source': set}).reset_index()
# The relation type is assigned AFTER the aggregation: agg keeps only the listed columns, so a
# ':TYPE' column set beforehand would be missing from the saved table.
miRNA_miRNA[':TYPE'] = 'genetically_interacts_with'
miRNA_miRNA.to_pickle(unprocessed_edge_data_location+'RNA_genetically_interacts_with_RNA.pkl')
miRNA_miRNA.head(n=3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0002435 (genetically interacts with) - Gene

* [GeneMANIA](http://genemania.org/) <br/>
GeneMANIA provides genes that are related, using a very large set of functional association data. Association data include protein and genetic interactions, pathways, co-expression, co-localization and protein domain similarity.

In [None]:
# Fetch the GeneMANIA combined network; the next cell reads it from unprocessed_data_location.
data_downloader('http://genemania.org/data/current/Homo_sapiens.COMBINED/COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt', unprocessed_data_location)

In [None]:
# Read the weighted gene-gene network and translate both gene columns with the
# Ensembl gene -> Entrez gene table; the mapped IDs end up in columns '1_x' (Gene_A) and '1_y' (Gene_B)
gene_gene = (
    pd.read_csv(unprocessed_data_location + "COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt", sep='\t')
    .merge(ensembl_entrezGene_map.rename(columns={0: 'Gene_A'}), on='Gene_A')
    .merge(ensembl_entrezGene_map.rename(columns={0: 'Gene_B'}), on='Gene_B')
)
gene_gene[['1_x', '1_y', 'Weight']].head(n=3)

In [None]:
gene_gene['Source'] = 'GeneMANIA' 
gene_gene = gene_gene.rename(columns={'1_x':':START_ID','1_y':':END_ID'})
# Add the reverse direction of every pair so the relation is stored symmetrically
gene_gene_inv = gene_gene.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
gene_gene = pd.concat([gene_gene,gene_gene_inv])
# One row per (start, end) pair. The string alias 'mean' replaces np.mean, which pandas
# deprecates as an agg argument.
gene_gene = gene_gene.groupby([':START_ID',':END_ID']).agg({'Source': set, 'Weight': 'mean'}).reset_index()
gene_gene = gene_gene.rename(columns={'Weight':'GeneMANIA_weight'})
# The relation type is assigned AFTER the aggregation: agg keeps only the listed columns, so a
# ':TYPE' column set beforehand would be missing from the saved table.
gene_gene[':TYPE'] = 'genetically_interacts_with'
gene_gene.to_pickle(unprocessed_edge_data_location+'gene_genetically_interacts_with_gene.pkl')
gene_gene.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_HOM0000001 (in homology relationship with) - RNA


* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# Download the TAM miRNA set file (same file and target path as in the genetic-interaction section above).
! wget http://www.lirmed.com/tam2/Public/static/data/mirset_v9.txt -O ../resources/processed_data/unprocessed_data/mirset_v9.txt

In [None]:
# Parse the TAM miRNA-set file by hand: rows have a variable number of tab-separated fields,
# so shorter rows are padded with '' once loaded into a DataFrame.
# Loading it here keeps the cell runnable on a fresh kernel instead of depending on a TAM
# variable left over from an earlier cell.
with open(unprocessed_data_location+'mirset_v9.txt', 'r') as tam_file:
    tam_text = tam_file.read().rstrip()
TAM = pd.DataFrame([ln.rstrip().split('\t') for ln in tam_text.split('\n')]).fillna('')

# Keep the rows whose first field contains "amily" (the family sets); .copy() so the
# column assignments below act on an independent frame rather than on a slice of TAM.
miRNA_miRNA = TAM[TAM[0].str.contains("amily")].copy()
miRNA_miRNA = miRNA_miRNA.dropna(axis=1, how='all')
# Columns 0 and 1 are not needed to build the pairs
miRNA_miRNA = miRNA_miRNA.drop(columns=[0, 1])
# Column 2 is used as the anchor member; all remaining members are joined into one
# comma-separated string so they can be exploded into one row per (anchor, member) pair.
miRNA_miRNA['merged'] = miRNA_miRNA[miRNA_miRNA.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
miRNA_miRNA = miRNA_miRNA[[2, 'merged']]

miRNA_miRNA['merged'] = miRNA_miRNA.merged.str.split(',')
miRNA_miRNA = miRNA_miRNA.explode('merged')
# The '' padding of short rows produces empty members: discard them
miRNA_miRNA = miRNA_miRNA[miRNA_miRNA.merged != '']

# Coverage check: True only if every anchor label has an entry in the RNAcentral map
print(all(miRNA_miRNA[2].isin(rnacentral_map_human['DB Description'])))
# RNA categories of the anchor labels that are found in the map
print(rnacentral_map_human[rnacentral_map_human['DB Description'].isin(miRNA_miRNA[2])]['RNA category'].unique())
# Same coverage check for the other family members
print(all(miRNA_miRNA['merged'].isin(rnacentral_map_human['DB Description'])))
# RNA categories of the member labels that are found in the map
print(rnacentral_map_human[rnacentral_map_human['DB Description'].isin(miRNA_miRNA['merged'])]['RNA category'].unique())

In [None]:
# Anchor labels without an RNAcentral entry
# (according to miRBase, these miRNA labels all belong to dead hairpin entries)
anchor_is_mapped = miRNA_miRNA[2].isin(rnacentral_map_human['DB Description'])
miRNA_miRNA.loc[~anchor_is_mapped].head(n=3)

In [None]:
# Translate both members of every pair from their label to an RNAcentral ID.
# Only the two columns needed for the look-up are carried through the merges.
label_to_urs = rnacentral_map_human[['DB Description', 'RNAcentral ID']]
anchor_lookup = label_to_urs.rename(columns={'DB Description': 2, 'RNAcentral ID': 'RNA1'})
member_lookup = label_to_urs.rename(columns={'DB Description': 'merged', 'RNAcentral ID': 'RNA2'})
mapped_pairs = miRNA_miRNA.merge(anchor_lookup, on=2).merge(member_lookup, on='merged')
mapped_pairs = mapped_pairs[['RNA1', 'RNA2']]

# Order the two identifiers inside each row so that a pair looks the same
# regardless of which member was the anchor.
miRNA_miRNA = mapped_pairs.apply(sorted, axis=1, result_type='expand').rename(
    columns={0: ':START_ID', 1: ':END_ID'})
miRNA_miRNA['Source'] = 'TAM'
miRNA_miRNA.head(n=3)

In [None]:
# Make the homology relation symmetric: add the reverse of every pair, then collapse
# repeated (start, end) pairs while collecting their sources into a set.
miRNA_miRNA_inv = miRNA_miRNA.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
miRNA_miRNA = pd.concat([miRNA_miRNA,miRNA_miRNA_inv])
miRNA_miRNA = miRNA_miRNA.groupby([':START_ID',':END_ID']).agg({'Source': set}).reset_index()
# The relation label is attached AFTER the aggregation: agg() only keeps the columns it is
# given, so a ':TYPE' column created beforehand never reaches the saved table.
miRNA_miRNA[':TYPE'] = 'in_homology_relationship_with'
miRNA_miRNA.to_pickle(unprocessed_edge_data_location+'RNA_in_homology_relationship_with_RNA.pkl')
miRNA_miRNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002450 (directly positively regulates activity of) - RNA

* [TarBase](https://dianalab.e-ce.uth.gr/tarbasev9) <br />  DIANA-TarBase v9 is a reference database devoted to the indexing of experimentally supported microRNA (miRNA) targets on protein-coding transcripts.

In [None]:
# Fetch the TarBase v9 human interaction table into the unprocessed-data folder
tarbase_url = 'https://dianalab.e-ce.uth.gr/tarbasev9/data/Homo_sapiens_TarBase-v9.tsv.gz'
data_downloader(tarbase_url, unprocessed_data_location)

In [None]:
# Load the TarBase human interactions, keeping only the columns used below.
# PubMed ids are read as strings and converted explicitly further down.
tarbase_columns = ['mirna_id', 'transcript_id', 'experimental_method', 'regulation', 'tissue',
                   'cell_line', 'article_pubmed_id', 'microt_score']
miRNA_RNA = pd.read_csv(unprocessed_data_location + 'Homo_sapiens_TarBase-v9.tsv', sep="\t",
                        dtype={'article_pubmed_id': 'string'})[tarbase_columns]

# microT score >=0.7 is a reliable threshold according to TarBase
# (rows without a score fail the comparison and are dropped too).
# .copy() so the column assignments below act on an independent frame.
miRNA_RNA = miRNA_RNA[miRNA_RNA['microt_score'] >= 0.7].copy()

# The cell line is the location; fall back to the tissue when no cell line is given
miRNA_RNA['cell_line'] = miRNA_RNA['cell_line'].fillna(miRNA_RNA['tissue'])
miRNA_RNA = miRNA_RNA.drop(columns=['tissue'])
# Normalise the label before the look-up. ' cells' must be stripped BEFORE ' cell':
# in the opposite order "hela cells" becomes "helas" and the plural rule never fires.
# NOTE(review): confirm location_map keys were not built from the old "helas"-style labels.
miRNA_RNA['cell_line'] = (
    miRNA_RNA['cell_line'].str.lower()
    .str.replace("-", "", regex=False)
    .str.replace("_", " ", regex=False)
    .str.replace("  ", " ", regex=False)
    .str.replace(' cells', '', regex=False)
    .str.replace(' cell', '', regex=False)
)
miRNA_RNA['experimental_method'] = miRNA_RNA['experimental_method'].str.lower()
# Non-numeric PubMed ids become missing values
miRNA_RNA['article_pubmed_id'] = pd.to_numeric(miRNA_RNA['article_pubmed_id'], errors='coerce')

# Map the normalised location through location_map (matched on 0_y, mapped value in 0_x);
# labels without a match keep their normalised text.
miRNA_RNA = pd.merge(miRNA_RNA, location_map, right_on='0_y', left_on='cell_line', how='left')
miRNA_RNA['0_x'] = miRNA_RNA['0_x'].fillna(miRNA_RNA['cell_line'])
miRNA_RNA = miRNA_RNA.drop(columns=['0_y', 'cell_line'])
miRNA_RNA = miRNA_RNA.rename(columns={'0_x':'Location'})

# Same look-up-with-fallback for the experimental method
miRNA_RNA = pd.merge(miRNA_RNA, method_map, right_on='0_y', left_on='experimental_method', how='left')
miRNA_RNA['0_x'] = miRNA_RNA['0_x'].fillna(miRNA_RNA['experimental_method'])
miRNA_RNA = miRNA_RNA.drop(columns=['0_y', 'experimental_method'])
miRNA_RNA = miRNA_RNA.rename(columns={'0_x':'Method','article_pubmed_id':'PubMedID'})

miRNA_RNA.head(n=3)

In [None]:
# Coverage check: True when every TarBase miRNA id is present in the miRBase -> RNAcentral map
mirna_is_mapped = miRNA_RNA['mirna_id'].isin(rnacentral_map_human_mirbase['miRBase ID'])
all(mirna_is_mapped)

In [None]:
# Replace the miRBase id of the regulator with its RNAcentral ID (stored as RNA1)
miRNA_RNA = (
    pd.merge(
        miRNA_RNA.rename(columns={'mirna_id': 'miRBase ID'}),
        rnacentral_map_human_mirbase,
        on='miRBase ID',
    )
    .drop(columns=['miRBase ID'])
    .rename(columns={'RNAcentral ID': 'RNA1'})
)

In [None]:
# Peek at transcript ids missing from ensembl_entrezTranscript_map
# (Ensembl IDs not present in our look-up tables all belong to retired or novel transcripts)
in_lookup = miRNA_RNA['transcript_id'].isin(ensembl_entrezTranscript_map[1])
miRNA_RNA.loc[~in_lookup, 'transcript_id'].unique()[:3]

In [None]:
# Targets whose Ensembl transcript has an RNAcentral entry: RNA2 becomes the RNAcentral ID
ensembl_to_urs = rnacentral_map_human_ensembl.rename(columns={'Ensembl transcript ID': 'transcript_id'})
targets_in_rnacentral = (
    miRNA_RNA.merge(ensembl_to_urs, on='transcript_id')
    .drop(columns=['Ensembl Gene ID', 'transcript_id'])
    .rename(columns={'RNAcentral ID': 'RNA2'})
)

# One row per distinct piece of evidence; its microT scores are gathered and then averaged.
# NOTE(review): groupby skips rows where any key is missing (pandas default dropna=True),
# e.g. interactions without a PubMed id - confirm this is intended.
evidence_keys = ['RNA1', 'RNA2', 'Method', 'regulation', 'Location', 'PubMedID']
miRNA_RNA_rnacentral = targets_in_rnacentral.groupby(evidence_keys)['microt_score'].agg(list).reset_index()
miRNA_RNA_rnacentral['microT_score'] = miRNA_RNA_rnacentral.pop('microt_score').apply(np.mean)

miRNA_RNA_rnacentral['Source'] = 'TarBase, microT, miRNet'
miRNA_RNA_rnacentral.head(n=3)

In [None]:
# Targets known to ensembl_entrezTranscript_map but absent from RNAcentral:
# these keep their Ensembl transcript id as RNA2.
entrez_transcripts = ensembl_entrezTranscript_map.rename(columns={1: 'transcript_id'})
ensembl_only = miRNA_RNA.merge(entrez_transcripts, on='transcript_id').drop(columns=[0, 2, 3, 4, 5])
already_in_rnacentral = ensembl_only['transcript_id'].isin(rnacentral_map_human_ensembl['Ensembl transcript ID'])
ensembl_only = ensembl_only[~already_in_rnacentral].rename(columns={'transcript_id': 'RNA2'})

# One row per distinct piece of evidence; its microT scores are gathered and then averaged.
# NOTE(review): groupby skips rows where any key is missing (pandas default dropna=True).
evidence_keys = ['RNA1', 'RNA2', 'Method', 'regulation', 'Location', 'PubMedID']
miRNA_RNA_ensembl = ensembl_only.groupby(evidence_keys)['microt_score'].agg(list).reset_index()
miRNA_RNA_ensembl['microT_score'] = miRNA_RNA_ensembl.pop('microt_score').apply(np.mean)

miRNA_RNA_ensembl['Source'] = 'TarBase, microT, miRNet'
miRNA_RNA_ensembl.head(n=3)

In [None]:
# RNAcentral-mapped targets with "Positive" regulation
is_positive = miRNA_RNA_rnacentral['regulation'] == "Positive"
miRNA_RNA_rnacentral_pos = miRNA_RNA_rnacentral.loc[is_positive].drop(columns=['regulation'])
miRNA_RNA_rnacentral_pos.head(n=3) # Empty

In [None]:
# Ensembl-only targets with "Positive" regulation
is_positive = miRNA_RNA_ensembl['regulation'] == "Positive"
miRNA_RNA_ensembl_pos = miRNA_RNA_ensembl.loc[is_positive].drop(columns=['regulation'])
miRNA_RNA_ensembl_pos.head(n=3)

In [None]:
# Stack both target flavours and switch to the edge-table column names
positive_edges = pd.concat([miRNA_RNA_rnacentral_pos, miRNA_RNA_ensembl_pos])
positive_edges = positive_edges.rename(columns={'RNA1': ':START_ID', 'RNA2': ':END_ID'})

# 'Source' holds a comma-separated list: one row per source so they can be gathered in a set
positive_edges['Source'] = positive_edges['Source'].str.split(", ")
positive_edges = positive_edges.explode('Source')

# One row per (start, end) pair: evidence is collected into sets, the score is averaged
edge_aggregations = {'Method': set, 'Location': set, 'PubMedID': set,
                     'microT_score': np.mean, 'Source': set}
miRNA_RNA_pos = positive_edges.groupby([':START_ID', ':END_ID']).agg(edge_aggregations).reset_index()

miRNA_RNA_pos[':TYPE'] = 'directly_positively_regulates_activity_of'
miRNA_RNA_pos.to_pickle(unprocessed_edge_data_location+'RNA_directly_positively_regulates_activity_of_RNA.pkl')
miRNA_RNA_pos.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002449 (directly negatively regulates activity of) - RNA

* [TarBase](https://dianalab.e-ce.uth.gr/tarbasev9)

In [None]:
# RNAcentral-mapped targets with "Negative" regulation
is_negative = miRNA_RNA_rnacentral['regulation'] == "Negative"
miRNA_RNA_rnacentral_neg = miRNA_RNA_rnacentral.loc[is_negative].drop(columns=['regulation'])
miRNA_RNA_rnacentral_neg.head(n=3)

In [None]:
# Ensembl-only targets with "Negative" regulation
is_negative = miRNA_RNA_ensembl['regulation'] == "Negative"
miRNA_RNA_ensembl_neg = miRNA_RNA_ensembl.loc[is_negative].drop(columns=['regulation'])
miRNA_RNA_ensembl_neg.head(n=3)

In [None]:
# Stack both target flavours and switch to the edge-table column names
negative_edges = pd.concat([miRNA_RNA_rnacentral_neg, miRNA_RNA_ensembl_neg])
negative_edges = negative_edges.rename(columns={'RNA1': ':START_ID', 'RNA2': ':END_ID'})

# 'Source' holds a comma-separated list: one row per source so they can be gathered in a set
negative_edges['Source'] = negative_edges['Source'].str.split(", ")
negative_edges = negative_edges.explode('Source')

# One row per (start, end) pair: evidence is collected into sets, the score is averaged
edge_aggregations = {'Method': set, 'Location': set, 'PubMedID': set,
                     'microT_score': np.mean, 'Source': set}
miRNA_RNA_neg = negative_edges.groupby([':START_ID', ':END_ID']).agg(edge_aggregations).reset_index()

miRNA_RNA_neg[':TYPE'] = 'directly_negatively_regulates_activity_of'
miRNA_RNA_neg.to_pickle(unprocessed_edge_data_location+'RNA_directly_negatively_regulates_activity_of_RNA.pkl')
miRNA_RNA_neg.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002448 (directly regulates activity of) - RNA


* [TargetScan](https://www.targetscan.org/vert_80/) <br /> TargetScan predicts biological targets of miRNAs by searching for the presence of conserved 8mer, 7mer, and 6mer sites that match the seed region of each miRNA. 

In [None]:
!wget https://www.targetscan.org/vert_80/vert_80_data_download/Predicted_Targets_Context_Scores.default_predictions.txt.zip -O ../resources/processed_data/unprocessed_data/Predicted_Targets_Context_Scores.default_predictions.txt.zip

In [None]:
# --- Load the TargetScan predictions and keep the rows with Gene Tax ID 9606 ---
targetscan_zip = unprocessed_data_location+'Predicted_Targets_Context_Scores.default_predictions.txt.zip'
targetscan = pd.read_csv(targetscan_zip, sep='\t')
targetscan = targetscan[targetscan['Gene Tax ID'] == 9606]

# From TargetScan: The context++ score (CS) for a specific site is the sum of the contribution of 14 features (Agarwal et al., 2015)
# including site type, supplementary pairing, local AU, minimum distance, and PCT (probability of conserved targeting)
# Of the score columns only 'weighted context++ score' is kept.
unused_columns = ['Gene ID','Gene Symbol','Gene Tax ID', 'Site Type', 'UTR_start', 'UTR end', 'context++ score',
                  'context++ score percentile', 'weighted context++ score percentile', 'Predicted relative KD']
miRNA_RNA = targetscan.drop(columns=unused_columns)

# Targets with lowest context++ scores are the most representative ones
print(miRNA_RNA['weighted context++ score'].min())

# Keep only the part of each identifier that precedes the first '.'
for dotted_column in ('Transcript ID', 'miRNA'):
    miRNA_RNA[dotted_column] = miRNA_RNA[dotted_column].str.split('.').str[0]

print(miRNA_RNA[~miRNA_RNA['Transcript ID'].str.startswith('ENST')]['Transcript ID'].unique())

# --- CDR1as is not an Ensembl id: expand it to the transcripts of NCBI Entrez gene 286411 ---
is_cdr1as = miRNA_RNA['Transcript ID'] == 'CDR1as'
miRNA_RNA = miRNA_RNA[~(is_cdr1as & miRNA_RNA.duplicated())]
cdr1as_rows = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[0] == 286411]
cdr1as_transcripts = ', '.join(list(cdr1as_rows[1]))
miRNA_RNA.loc[miRNA_RNA['Transcript ID'] == 'CDR1as', 'Transcript ID'] = cdr1as_transcripts
miRNA_RNA['Transcript ID'] = miRNA_RNA['Transcript ID'].str.split(', ')
miRNA_RNA = miRNA_RNA.explode('Transcript ID')

# --- Map miRNA labels to RNAcentral IDs (RNA1) ---
mirna_to_urs = rnacentral_map_human.rename(columns={'DB Description':'miRNA'})
label_is_known = miRNA_RNA['miRNA'].isin(rnacentral_map_human['DB Description'])
print(all(label_is_known))
# Labels that are not found are retried with an arm suffix appended ('-5p' first, then '-3p')
unlabelled = miRNA_RNA[~label_is_known]
arm_variants = [unlabelled.assign(miRNA=unlabelled['miRNA'].astype(str) + arm) for arm in ('-5p', '-3p')]
miRNA_RNA_miRNAnotInRNAcentral = (
    pd.concat(arm_variants)
    .merge(mirna_to_urs, on='miRNA')
    .drop(columns=['miRNA'])
    .rename(columns={'RNAcentral ID':'RNA1'})
)

miRNA_RNA = miRNA_RNA.merge(mirna_to_urs, on='miRNA').rename(columns={'RNAcentral ID':'RNA1'})
miRNA_RNA = pd.concat([miRNA_RNA, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB', 'Organism', 'RNA category'])
# Missing values are replaced by the string 'nan' before grouping
miRNA_RNA = miRNA_RNA.fillna('nan')
# One row per (transcript, miRNA): the scores of all its sites are gathered and averaged
miRNA_RNA = miRNA_RNA.groupby(['Transcript ID', 'RNA1'])['weighted context++ score'].agg(list).reset_index()
miRNA_RNA['Weighted_CS_score'] = miRNA_RNA.pop('weighted context++ score').apply(np.mean)

miRNA_RNA['Source'] = 'TargetScan'

# --- Map target transcripts: RNAcentral ID when available, Ensembl transcript id otherwise ---
# (no `on=`: the merge uses the columns the two frames share)
ensembl_to_urs = rnacentral_map_human_ensembl.rename(columns={'Ensembl transcript ID': 'Transcript ID'})
miRNA_RNAcentral = (
    pd.merge(miRNA_RNA, ensembl_to_urs)
    .drop(columns=['Ensembl Gene ID'])
    .rename(columns={'RNAcentral ID':'RNA2'})
)

transcript_in_rnacentral = miRNA_RNA['Transcript ID'].isin(miRNA_RNAcentral['Transcript ID'])
miRNA_RNA_ensembl = miRNA_RNA[~transcript_in_rnacentral].rename(columns={'Transcript ID':'RNA2'})

miRNA_RNAcentral = miRNA_RNAcentral.drop(columns=['Transcript ID'])

print(miRNA_RNAcentral.head(n=3))
print(miRNA_RNA_ensembl.head(n=3))

RNA_directly_regulates_activity_of_RNA_1 = pd.concat([miRNA_RNAcentral, miRNA_RNA_ensembl]).rename(
    columns={'RNA1':':START_ID','RNA2':':END_ID'})
RNA_directly_regulates_activity_of_RNA_1.head(n=3)

* [miRDB](https://mirdb.org/index.html) <br />  miRDB is an online database for miRNA target prediction and functional annotations. All the targets in miRDB were predicted by a bioinformatics tool, MirTarget, which was developed by analyzing thousands of miRNA-target interactions from high-throughput sequencing experiments.

In [None]:
# Fetch the miRDB v6.0 prediction table into the unprocessed-data folder
mirdb_url = 'https://mirdb.org/download/miRDB_v6.0_prediction_result.txt.gz'
data_downloader(mirdb_url, unprocessed_data_location)

In [None]:
# The miRDB file has no header row: columns are named regulator (RNA1), target (RNA2), score
mirdb_path = unprocessed_data_location+'miRDB_v6.0_prediction_result.txt'
mirdb = pd.read_csv(mirdb_path, sep='\t', names=['RNA1', 'RNA2', 'score'])
# From miRDB:
# All the predicted targets have target prediction scores between 50 - 100.
# These scores are assigned by the new computational target prediction algorithm.
# The higher the score, the more confidence we have in this prediction.
# That is why the search result is ordered by prediction score.
# In our experience, a predicted target with prediction score > 80 is most likely to be real.
# If the score is below 60, you need to be cautious and it is recommended to have other supporting evidence as well.
mirdb = mirdb[mirdb['score'] > 80]

label_is_known = mirdb['RNA1'].isin(rnacentral_map_human['DB Description'])
print(all(label_is_known))
# Sequences not in RNAcentral all belong to non-human miRNAs (3-letter prefixes shown)
print(mirdb[~label_is_known]['RNA1'].str[:3].unique())

# Replace the miRNA label with its RNAcentral ID
mirna_to_urs = rnacentral_map_human.rename(columns={'DB Description':'RNA1'})
miRNA_mRNA2 = mirdb.merge(mirna_to_urs, on='RNA1').drop(columns=['RNA1']).rename(columns={'RNAcentral ID':'RNA1'})

# Missing values are replaced by the string 'nan' before grouping
miRNA_mRNA2 = miRNA_mRNA2.fillna('nan')
# One row per (miRNA, target): scores are gathered and averaged
miRNA_mRNA2 = miRNA_mRNA2.groupby(['RNA1', 'RNA2'])['score'].agg(list).reset_index()
miRNA_mRNA2['miRDB_score'] = miRNA_mRNA2.pop('score').apply(np.mean)
miRNA_mRNA2['Source'] = 'miRDB'

# Targets are matched against the RefSeq ids of the RNAcentral map; rows without a match are dropped
refseq_to_urs = rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']]
RNA_directly_regulates_activity_of_RNA_2 = (
    pd.merge(miRNA_mRNA2, refseq_to_urs, left_on='RNA2', right_on='RefSeq ID')
    .drop(columns=['RefSeq ID', 'RNA2'])
    .rename(columns={'RNA1':':START_ID','RNAcentral ID':':END_ID'})
)
RNA_directly_regulates_activity_of_RNA_2.head(n=3)

* [miRecords](http://c1.accurascience.com/miRecords/download_data.php?v=4) <br />  miRecords is a resource for animal miRNA-target interactions.

In [None]:
!wget http://c1.accurascience.com/miRecords/download_data.php?v=4 -O ../resources/processed_data/unprocessed_data/download_data.php?v=4.csv

In [None]:
# The download is read with read_excel even though the file was saved with a .csv name
miRNA_mRNA2 = pd.read_excel(unprocessed_data_location+"download_data.php?v=4.csv")

# We keep only Homo sapiens rows (a missing species counts as "not human" instead of
# breaking the boolean mask); .copy() so the edits below act on an independent frame.
target_is_human = miRNA_mRNA2['Target gene_species_scientific'].str.contains("apiens", na=False)
mirna_is_human = miRNA_mRNA2['miRNA_species'].str.contains("apiens", na=False)
miRNA_mRNA2 = miRNA_mRNA2[target_is_human & mirna_is_human].copy()
# Keep only the part of the RefSeq accession that precedes the first '.'
miRNA_mRNA2['Target gene_Refseq_acc'] = miRNA_mRNA2['Target gene_Refseq_acc'].str.split(".").str[0]
miRNA_mRNA2 = miRNA_mRNA2.rename(columns={'Target gene_Refseq_acc': 'RNA2', 'miRNA_mature_ID': 'RNA1'})
miRNA_mRNA2 = miRNA_mRNA2.drop(columns=['Target gene_species_scientific','Target gene_name','miRNA_species'])
# Strip literal square brackets. regex=False is required: as a regular expression a lone
# "[" is invalid and raises on pandas versions where str.replace defaults to regex=True.
miRNA_mRNA2['RNA1'] = miRNA_mRNA2['RNA1'].str.replace("[", "", regex=False).str.replace("]", "", regex=False)

# --- Map miRNA labels to RNAcentral IDs ---
mirna_to_urs = rnacentral_map_human.rename(columns={'DB Description':'RNA1'})
label_is_known = miRNA_mRNA2['RNA1'].isin(rnacentral_map_human['DB Description'])
print(all(label_is_known))
print(miRNA_mRNA2[~label_is_known]['RNA1'].str[:3].unique())
# Labels that are not found are retried with an arm suffix ('-5p', '-3p'), each both with
# and without an "hsa-" prefix.
unlabelled = miRNA_mRNA2[~label_is_known]
arm_variants = pd.concat([unlabelled.assign(RNA1=unlabelled['RNA1'].astype(str) + arm)
                          for arm in ('-5p', '-3p')])
hsa_variants = arm_variants.assign(RNA1="hsa-" + arm_variants['RNA1'])
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([hsa_variants, arm_variants])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, mirna_to_urs, on='RNA1').drop(
    columns=['RNA1']).rename(columns={'RNAcentral ID':'RNA1'})
miRNA_mRNA2 = pd.merge(miRNA_mRNA2, mirna_to_urs, on='RNA1').drop(
    columns=['RNA1']).rename(columns={'RNAcentral ID':'RNA1'})
miRNA_mRNA2 = pd.concat([miRNA_mRNA2, miRNA_RNA_miRNAnotInRNAcentral])

# Drop the miRecords annotation columns and the RNAcentral map columns that are not used
miRNA_mRNA2 = miRNA_mRNA2.drop(columns=['Target site_number','miRNA_regulation','Reporter_target gene/region','Mutation_target region',
                                        'Reporter link element','Target gene mRNA_level','Original description',
                                        'Post mutation_method', 'Original description_mutation_region', 'Target site_position',
                                        'miRNA_regulation_site','Reporter_target site','Reporter link element.1','Test_method_inter_site',
                                        'Original description_inter_site','Mutation_target site','Post mutation_method_site',
                                        'Original description_mutation_site','Additional note','DB','DB ID','Organism','RNA category'])

# Replace the target RefSeq accession with its RNAcentral ID; rows without a match are dropped
miRNA_mRNA2 = pd.merge(miRNA_mRNA2, rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].rename(
    columns={'RefSeq ID':'RNA2'}), on='RNA2').drop(columns=['RNA2']).rename(columns={'RNAcentral ID':'RNA2'})

miRNA_mRNA2['Source'] = 'miRecords, miRNet'

# Non-numeric PubMed ids and the placeholder 0 become missing values
miRNA_mRNA2['Pubmed_id'] = pd.to_numeric(miRNA_mRNA2['Pubmed_id'], errors='coerce')
miRNA_mRNA2['Pubmed_id'] = miRNA_mRNA2['Pubmed_id'].replace(0, np.nan)

# Methods are stored as "{a}{b}": turn them into "a, b", then into one row per method.
# Literal (regex=False) replacements avoid the invalid "\}" / "\{" escape sequences.
miRNA_mRNA2['Test_method_inter'] = (
    miRNA_mRNA2['Test_method_inter'].str.lower()
    .str.replace("}{", ", ", regex=False)
    .str.replace("{", "", regex=False)
    .str.replace("}", "", regex=False)
)
miRNA_mRNA2['Test_method_inter'] = miRNA_mRNA2['Test_method_inter'].replace("n/a", np.nan)
miRNA_mRNA2['Test_method_inter'] = miRNA_mRNA2['Test_method_inter'].str.split(", ")
miRNA_mRNA2 = miRNA_mRNA2.explode('Test_method_inter')
# Map the method through method_map (matched on 0_y, mapped value in 0_x);
# methods without a match keep their original text.
miRNA_mRNA2 = pd.merge(miRNA_mRNA2, method_map, right_on='0_y', left_on='Test_method_inter', how='left')
miRNA_mRNA2['0_x'] = miRNA_mRNA2['0_x'].fillna(miRNA_mRNA2['Test_method_inter'])
miRNA_mRNA2 = miRNA_mRNA2.drop(columns=['0_y', 'Test_method_inter'])
miRNA_mRNA2 = miRNA_mRNA2.rename(columns={'0_x':'Method','Pubmed_id':'PubMedID'})

RNA_directly_regulates_activity_of_RNA_3 = miRNA_mRNA2.copy()
RNA_directly_regulates_activity_of_RNA_3.rename(columns={'RNA1':':START_ID','RNA2':':END_ID'},inplace=True)
RNA_directly_regulates_activity_of_RNA_3.head(n=3)

* [SomamiR](https://compbio.uthsc.edu/SomamiR/) <br /> SomamiR is a database of cancer somatic mutations in microRNAs (miRNA) and their target sites that potentially alter the interactions between miRNAs and competing endogenous RNAs (ceRNA) including mRNAs, circular RNAs (circRNA) and long noncoding RNAs (lncRNA).

In [None]:
!wget https://compbio.uthsc.edu/SomamiR/download/predicted_mRNA_targets_somamir_v2.0.txt.tar.gz -O ../resources/processed_data/unprocessed_data/predicted_mRNA_targets_somamir_v2.0.txt.tar.gz

In [None]:
mRNA_miRNA2 = pd.read_csv(unprocessed_data_location + 'predicted_mRNA_targets_somamir_v2.0.txt.tar.gz', sep='\t')
# We select only relationships validated by TargetScan
# (.copy() so the edits below act on an independent frame, not on a slice)
mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['Targetscan'] == 1].copy()

mRNA_miRNA2 = mRNA_miRNA2.drop(columns=[
    'Genesymbol', 'Targetscan', 'Chromosome', 'strand', 'Mutationlocation', 'Mutationid', 'Sample_id',
    'WTallele', 'Mutantallele', 'Organisms', 'Targetsiteclass', 'Seed_mod',
    'mRNAseq', 'miRseedseq', 'Seedclass', 'WTconservation', 'wildtype_csp',
    'mutant_csp', 'display_first', 'pita_ref', 'pita_mut', 'pita_diff'
])

# --- Map miRNA labels to RNAcentral IDs ---
mirna_to_urs = rnacentral_map_human.rename(columns={'DB Description':'miRNA'})
label_is_known = mRNA_miRNA2['miRNA'].isin(rnacentral_map_human['DB Description'])
print(all(label_is_known))
print(mRNA_miRNA2[~label_is_known]['miRNA'].str[:3].unique())
# Labels that are not found are retried with an arm suffix appended ('-5p' first, then '-3p')
unlabelled = mRNA_miRNA2[~label_is_known]
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([unlabelled.assign(miRNA=unlabelled['miRNA'].astype(str) + arm)
                                            for arm in ('-5p', '-3p')])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, mirna_to_urs, on='miRNA').drop(
    columns=['miRNA']).rename(columns={'RNAcentral ID':'miRNA'})
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, mirna_to_urs, on='miRNA').drop(
    columns=['miRNA']).rename(columns={'RNAcentral ID':'miRNA'})
mRNA_miRNA2 = pd.concat([mRNA_miRNA2, miRNA_RNA_miRNAnotInRNAcentral]).rename(columns={'miRNA':'RNA1','Refseq':'RNA2'})

# PubMed ids as text without the float suffix. The pattern is anchored and escaped: an
# unanchored ".0" used as a regular expression also deletes pairs like "10" or "20" inside an id.
# NOTE(review): ids that cannot be parsed end up as the string 'nan' here - confirm that is intended.
mRNA_miRNA2['Pubmedid'] = pd.to_numeric(mRNA_miRNA2['Pubmedid'], errors='coerce')
mRNA_miRNA2['Pubmedid'] = mRNA_miRNA2['Pubmedid'].astype(str)
mRNA_miRNA2['Pubmedid'] = mRNA_miRNA2['Pubmedid'].str.replace(r"\.0$", "", regex=True)
mRNA_miRNA2['Pubmedid'] = mRNA_miRNA2['Pubmedid'].replace("<NA>", np.nan)

# Normalise the cancer type. '[ns]' is removed as literal text (regex=False): as a regular
# expression it is a character class that deletes every 'n' and 's'.
mRNA_miRNA2['Cancertype'] = mRNA_miRNA2['Cancertype'].str.lower()
mRNA_miRNA2['Cancertype'] = mRNA_miRNA2['Cancertype'].str.replace('[ns]', '', regex=False)

# When the label still contains bracketed parts, keep only their contents (space-joined);
# non-string (missing) values are passed through untouched instead of raising.
mRNA_miRNA2['Cancertype'] = mRNA_miRNA2['Cancertype'].apply(
    lambda x: ' '.join(re.findall(r'\[(.*?)\]', x)) if isinstance(x, str) and '[' in x and ']' in x else x
).str.replace('_', ' ', regex=False)
# Map through disease_map (matched on 0_y, mapped value in 0_x); labels without a match
# keep their normalised text and empty labels become missing values.
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, disease_map, right_on='0_y', left_on='Cancertype', how='left')
mRNA_miRNA2['0_x'] = mRNA_miRNA2['0_x'].fillna(mRNA_miRNA2['Cancertype'])
mRNA_miRNA2['0_x'] = mRNA_miRNA2['0_x'].replace('',np.nan)
mRNA_miRNA2 = mRNA_miRNA2.drop(columns=['0_y', 'Cancertype'])
mRNA_miRNA2 = mRNA_miRNA2.rename(columns={'0_x':'Location'})

mRNA_miRNA2['Source'] = 'SomamiR, TargetScan'

# Replace the target RefSeq accession with its RNAcentral ID; rows without a match are dropped
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, rnacentral_map_human_refseq.rename(
    columns={"RefSeq ID":"RNA2"}), on="RNA2").drop(columns=['Label','RNA2','DB','DB ID','Organism','RNA category']).rename(columns={"RNAcentral ID":"RNA2"})

RNA_directly_regulates_activity_of_RNA_4 = mRNA_miRNA2.copy()
RNA_directly_regulates_activity_of_RNA_4.rename(columns={'RNA1':':START_ID','RNA2':':END_ID','Pubmedid':'PubMedID','U_mut_id':'Mutation'},inplace=True)
RNA_directly_regulates_activity_of_RNA_4.head(n=3)

* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php) <br /> miRdSNP is a database of disease-associated SNPs and microRNA target sites on 3'UTRs of human genes.

In [None]:
!wget http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv -O ../resources/processed_data/unprocessed_data/mirdsnp-dsnp-generated-mir-targets-v11.03.csv

In [None]:
# Build miRNA -> target edges from miRdSNP (output: RNA_directly_regulates_activity_of_RNA_5).
mRNA_miRNA2 = pd.read_csv(unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv')
# All rows are kept, whether experimentally confirmed or not (the filter below is disabled)
#mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['experimentally_confirmed']=='Yes']
mRNA_miRNA2.drop(columns=['experimentally_confirmed', 'gene_name', 'distance'],inplace=True)

# Coverage check: True only if every miRNA label has an entry in the RNAcentral map
print(all(mRNA_miRNA2['miR'].isin(rnacentral_map_human['DB Description'])))
# 3-letter prefixes of the labels that are not found
print(mRNA_miRNA2[~mRNA_miRNA2['miR'].isin(rnacentral_map_human['DB Description'])]['miR'].str[:3].unique())
# Labels that are not found are retried with an arm suffix appended.
# NOTE: the variable names are swapped w.r.t. the suffix they receive ("...3p" gets '-5p' and
# "...5p" gets '-3p'); the result is unaffected because both frames are concatenated.
miRNA_RNA_miRNAnotInRNAcentral3p = mRNA_miRNA2[~mRNA_miRNA2['miR'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = mRNA_miRNA2[~mRNA_miRNA2['miR'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p['miR'] = miRNA_RNA_miRNAnotInRNAcentral5p['miR'].astype(str) + '-3p'
miRNA_RNA_miRNAnotInRNAcentral3p['miR'] = miRNA_RNA_miRNAnotInRNAcentral3p['miR'].astype(str) + '-5p'
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral3p, miRNA_RNA_miRNAnotInRNAcentral5p])
# Inner merges: suffixed labels that still have no RNAcentral entry are dropped here
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miR'}), on='miR').drop(columns=['miR']).rename(columns={'RNAcentral ID':'miR'})
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, rnacentral_map_human.rename(columns={'DB Description':'miR'}), on='miR').drop(
    columns=['miR']).rename(columns={'RNAcentral ID':'miR'})
mRNA_miRNA2 = pd.concat([mRNA_miRNA2, miRNA_RNA_miRNAnotInRNAcentral]).rename(columns={'miR':'RNA1','refseq_id':'RNA2'})
# Missing values become the string 'nan' so that they can take part in the grouping below
mRNA_miRNA2 = mRNA_miRNA2.fillna('nan')
# Count how many times each fully identical row occurs, then keep one copy of each
mRNA_miRNA2['Number_of_experiments'] = (mRNA_miRNA2.groupby(mRNA_miRNA2.columns.tolist()).transform('size'))
mRNA_miRNA2 = mRNA_miRNA2[mRNA_miRNA2['Number_of_experiments'].notna()].drop_duplicates()
mRNA_miRNA2['Source'] = 'miRdSNP'

# Check whether every target RefSeq id is present in ensembl_refseq_map['xref']
print(all(mRNA_miRNA2['RNA2'].isin(ensembl_refseq_map['xref'])))

# Replace the target RefSeq id with its RNAcentral ID; rows without a match are dropped
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, rnacentral_map_human_refseq.rename(
    columns={"RefSeq ID":"RNA2"}), on="RNA2").drop(columns=['Label','RNA2','DB','DB ID','Organism','RNA category']).rename(columns={"RNAcentral ID":"RNA2"})

# 'diseases' holds a ', '-separated list: lower-case it and produce one row per disease
mRNA_miRNA2['diseases'] = mRNA_miRNA2['diseases'].str.lower()
mRNA_miRNA2['diseases'] = mRNA_miRNA2['diseases'].str.split(', ')
mRNA_miRNA2 = mRNA_miRNA2.explode('diseases')

# Map through disease_map (matched on 0_y, mapped value in 0_x); diseases without a match
# keep their lower-cased text. The result is stored in the 'Location' column.
mRNA_miRNA2 = pd.merge(mRNA_miRNA2, disease_map, right_on='0_y', left_on='diseases', how='left')
mRNA_miRNA2['0_x'] = mRNA_miRNA2['0_x'].fillna(mRNA_miRNA2['diseases'])
mRNA_miRNA2 = mRNA_miRNA2.drop(columns=['0_y', 'diseases'])
mRNA_miRNA2 = mRNA_miRNA2.rename(columns={'0_x':'Location'})

RNA_directly_regulates_activity_of_RNA_5 = mRNA_miRNA2.copy()
RNA_directly_regulates_activity_of_RNA_5.rename(columns={'RNA1':':START_ID','RNA2':':END_ID', "SNP":'Mutation'},inplace=True)
RNA_directly_regulates_activity_of_RNA_5.head(n=3)

In [None]:
# Stack the five per-source edge tables (TargetScan, miRDB, miRecords, SomamiR, miRdSNP)
per_source_edges = [
    RNA_directly_regulates_activity_of_RNA_1,
    RNA_directly_regulates_activity_of_RNA_2,
    RNA_directly_regulates_activity_of_RNA_3,
    RNA_directly_regulates_activity_of_RNA_4,
    RNA_directly_regulates_activity_of_RNA_5,
]
regulation_edges = pd.concat(per_source_edges)

# 'Source' holds a comma-separated list: one row per source so they can be gathered in a set
regulation_edges['Source'] = regulation_edges['Source'].str.split(", ")
regulation_edges = regulation_edges.explode('Source')

# One row per (start, end) pair: annotations are collected into sets, scores are averaged
edge_aggregations = {'Source': set, 'Method': set, 'Location': set, 'Mutation': set,
                     'PubMedID': set, 'miRDB_score': np.mean, 'Weighted_CS_score': np.mean}
RNA_directly_regulates_activity_of_RNA = regulation_edges.groupby(
    [':START_ID', ':END_ID']).agg(edge_aggregations).reset_index()
RNA_directly_regulates_activity_of_RNA[':TYPE'] = 'directly_regulates_activity_of'
RNA_directly_regulates_activity_of_RNA.to_pickle(unprocessed_edge_data_location+'RNA_directly_regulates_activity_of_RNA.pkl')
RNA_directly_regulates_activity_of_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0011002 (regulates activity of) - Gene


* [TarpiD](https://tarpid.nitrkl.ac.in/tarpid_db/) <br /> TarpiD is a manually curated database that aims to catalogue published piRNA:gene interactions and their functions, which are predicted computationally or validated experimentally by adopting different methods.

In [None]:
# Scrape the TarpiD detail pages (tpd_0 ... tpd_28681) once and cache the result on disk:
# the site is only queried when the cache file is missing, so re-running the notebook does
# not repeat ~28k HTTP requests.
TARPID_N_RECORDS = 28682
tarpid_cache = unprocessed_data_location + 'piRNA-gene.txt'

if not os.path.exists(tarpid_cache):
    tarpid_records = []
    with requests.Session() as session:
        for i in tqdm(range(TARPID_N_RECORDS), desc='TarpiD'):
            url = "https://tarpid.nitrkl.ac.in/tarpid_db/specific_search/pirna_gene_detail.php?tpid=tpd_" + str(i)
            response = session.get(url, timeout=60)
            soup = BeautifulSoup(response.text, 'html.parser')
            detail_content = soup.find('section', id='detail_content')
            detail_center = None
            if detail_content is not None:
                detail_center = detail_content.find_next('div', class_='detail_content_center')
            # Pages without the expected section are skipped instead of raising AttributeError
            if detail_center is None:
                continue
            # NB: the text is deliberately NOT stored in a variable named `content`, which
            # would overwrite the `content` module imported from reactome2py.
            tarpid_records.append(detail_center.text.strip())

    pd.DataFrame([record.split(" ") for record in tarpid_records]).drop_duplicates().to_csv(
        tarpid_cache, sep='\t', index=None)

# Always work on the table read back from disk: to_csv() returns None when given a path, and
# after the round-trip the positional columns are named with the strings "0", "1", "2", ...
piRNA_gene = pd.read_csv(tarpid_cache, sep="\t")

# Keep labels that start with "piR-<digit>" and do not contain "sse", "mmu", "gga" or "ur"
# (presumably markers of non-human entries - TODO confirm); na=False treats missing labels as no match.
pirna_label = piRNA_gene["0"]
keep_row = (
    ~pirna_label.str.contains("sse", na=False)
    & ~pirna_label.str.contains("mmu", na=False)
    & pirna_label.str.contains(r'^piR-\d', na=False)
    & ~pirna_label.str.contains("gga", na=False)
    & ~pirna_label.str.contains("ur", na=False)
)
piRNA_gene = piRNA_gene[keep_row].copy()
# Keep only the part of the third field that precedes the first ':'
piRNA_gene["2"] = piRNA_gene["2"].str.split(":").str[0]
print(piRNA_gene["1"].unique())
piRNA_gene.head(n=3)

In [None]:
# Rewrite the TarpiD identifiers with the "piR-hsa-" prefix so they can be joined on piRBase IDs.
tarpid_edges = piRNA_gene.assign(**{"0": piRNA_gene["0"].str.replace("piR-", "piR-hsa-")})

# Column "2" is joined against column 0 of symbol_entrez_map; column 1 of that map becomes the edge target.
tarpid_with_gene = tarpid_edges.merge(symbol_entrez_map, left_on="2", right_on=0)
tarpid_with_gene = tarpid_with_gene.drop(columns=["2", 0, "1"]).rename(columns={"0": 'piRNA', 1: ':END_ID'})

# Swap the piRBase identifier for its RNAcentral identifier (edge source).
pirbase_lookup = rnacentral_map_human_pirbase.rename(columns={'piRBase ID': 'piRNA'})
piRNA_gene_fixed = tarpid_with_gene.merge(pirbase_lookup, on='piRNA')
piRNA_gene_fixed = piRNA_gene_fixed.drop(columns=['piRNA']).rename(columns={'RNAcentral ID': ':START_ID'})

piRNA_gene_fixed['Source'] = 'TarpiD'
piRNA_gene_fixed.head(n=3)

* [miRTarBase](https://awi.cuhk.edu.cn/~miRTarBase/miRTarBase_2025/php/index.php) <br /> miRTarBase has accumulated more than three hundred and sixty thousand miRNA-target interactions (MTIs), which are collected by manually surveying pertinent literature after NLP of the text systematically to filter research articles related to functional studies of miRNAs.

In [None]:
# Download the miRTarBase miRNA-target interaction table (path segment "10.0" in the URL)
# into the unprocessed-data folder, where the next cell reads it.
!wget https://awi.cuhk.edu.cn/~miRTarBase/miRTarBase_2025/cache/download/10.0/miRTarBase_MTI.csv -O ../resources/processed_data/unprocessed_data/miRTarBase_MTI.csv

In [None]:
miRNA_gene = pd.read_csv(unprocessed_data_location + 'miRTarBase_MTI.csv',
                         dtype={'Target Gene (Entrez ID)': 'string', 'References (PMID)': 'string'})
# Identifiers are read as text; anything after a '.' (e.g. a float-style ".0") is discarded.
miRNA_gene['Target Gene (Entrez ID)'] = miRNA_gene['Target Gene (Entrez ID)'].str.split('.').str[0]
miRNA_gene['References (PMID)'] = miRNA_gene['References (PMID)'].str.split('.').str[0]
# Keep hsa -> hsa interactions whose support type is not flagged as "Weak".
miRNA_gene = miRNA_gene[(miRNA_gene['Species (miRNA)'] == 'hsa') & (miRNA_gene['Species (Target Gene)'] == 'hsa')]
miRNA_gene = miRNA_gene[~miRNA_gene['Support Type'].str.contains('Weak')]
miRNA_gene = miRNA_gene.drop(columns=['miRTarBase ID', 'Species (miRNA)', 'Target Gene', 'Species (Target Gene)', 'Support Type'])
# Normalise capitalisation of the miRNA names before matching them against RNAcentral descriptions.
miRNA_gene['miRNA'] = miRNA_gene['miRNA'].str.replace('Hsa-', 'hsa-')
miRNA_gene['miRNA'] = miRNA_gene['miRNA'].str.replace(r'^M', 'm', regex=True)

print(all(miRNA_gene['miRNA'].isin(rnacentral_map_human['DB Description'])))
miRNA_gene['miRNA'] = miRNA_gene['miRNA'].str.strip()
print(miRNA_gene[~miRNA_gene['miRNA'].isin(rnacentral_map_human['DB Description'])]['miRNA'].str[:3].unique())

# Names with no exact RNAcentral match are retried with '-5p' / '-3p' appended,
# and additionally with an 'hsa-' prefix. `assign` builds new frames, so nothing is
# written into a filtered slice (avoids SettingWithCopyWarning).
unmapped_miRNA = miRNA_gene[~miRNA_gene['miRNA'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = unmapped_miRNA.assign(miRNA=unmapped_miRNA['miRNA'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = unmapped_miRNA.assign(miRNA=unmapped_miRNA['miRNA'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentralhsa = miRNA_RNA_miRNAnotInRNAcentral.copy()
miRNA_RNA_miRNAnotInRNAcentralhsa['miRNA'] = "hsa-" + miRNA_RNA_miRNAnotInRNAcentralhsa['miRNA']
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentralhsa, miRNA_RNA_miRNAnotInRNAcentral5p,
                                            miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA'}), on='miRNA').drop(columns=['miRNA']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_gene = pd.merge(miRNA_gene, rnacentral_map_human.rename(columns={'DB Description':'miRNA'}), on='miRNA').drop(
    columns=['miRNA']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_gene = pd.concat([miRNA_gene, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])
miRNA_gene.rename(columns={'RNA':':START_ID','Target Gene (Entrez ID)':':END_ID'}, inplace=True)

# PMIDs as clean integer strings, missing values as NaN. Going through the nullable
# 'Int64' dtype avoids the float ".0" suffix altogether, so no pattern-based clean-up
# (which older pandas would interpret as a regex) is needed, and missing values
# reliably show up as "<NA>" before being turned back into NaN.
pmid_numeric = pd.to_numeric(miRNA_gene['References (PMID)'], errors='coerce')
miRNA_gene['References (PMID)'] = pmid_numeric.astype('Int64').astype(str).replace("<NA>", np.nan)

# 'Experiments' lists methods separated by "//" and ";": one row per method.
miRNA_gene['Experiments'] = miRNA_gene['Experiments'].str.lower()
miRNA_gene['Experiments'] = miRNA_gene['Experiments'].str.split("//", regex=False)
miRNA_gene = miRNA_gene.explode('Experiments')
miRNA_gene['Experiments'] = miRNA_gene['Experiments'].str.split(";")
miRNA_gene = miRNA_gene.explode('Experiments')
# Map free-text method names through method_map ('0_y' -> '0_x'); unmapped names are kept as-is.
miRNA_gene = pd.merge(miRNA_gene, method_map, right_on='0_y', left_on='Experiments', how='left')
miRNA_gene['0_x'] = miRNA_gene['0_x'].fillna(miRNA_gene['Experiments'])
miRNA_gene = miRNA_gene.drop(columns=['0_y', 'Experiments'])
miRNA_gene = miRNA_gene.rename(columns={'0_x':'Method','References (PMID)':'PubMedID'})

miRNA_gene['Source'] = 'miRTarBase'
miRNA_gene.head(n=3)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet miRNA-pseudogene table (Dropbox-hosted CSV) into the unprocessed-data folder.
!wget https://www.dropbox.com/s/r01ppq5x42v4lyh/miRNet-mir-pseudogene.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-mir-pseudogene.csv

In [None]:
# Load the miRNet miRNA-pseudogene table and discard the columns that are not used below.
miRNA_pseudogene = pd.read_csv(unprocessed_data_location+'miRNet-mir-pseudogene.csv').drop(
    columns=['mirnet', 'mir_id', 'entrez', 'symbol', 'gene_name', 'mbv'])

# Sanity check: is every accession present among the RNAcentral 'DB ID' values?
print(all(miRNA_pseudogene['mir_acc'].isin(rnacentral_map_human['DB ID'])))

# Accession -> RNAcentral identifier.
rnacentral_by_accession = rnacentral_map_human.rename(columns={'DB ID': 'mir_acc'})
miRNA_pseudogene = miRNA_pseudogene.merge(rnacentral_by_accession, on='mir_acc')
miRNA_pseudogene = miRNA_pseudogene.drop(
    columns=['DB', 'Organism', 'RNA category', 'DB Description', 'mir_acc']).rename(columns={'RNAcentral ID': 'RNA'})
miRNA_pseudogene['Source'] = 'miRNet'

# 'embl' is joined against column 0 of ensembl_entrezGene_map; column 1 of that map becomes the edge target.
gene_lookup = ensembl_entrezGene_map.rename(columns={0: 'embl', 1: ':END_ID'})
miRNA_pseudogene = miRNA_pseudogene.merge(gene_lookup, on='embl')
miRNA_pseudogene = miRNA_pseudogene.drop(columns=[2, 3, 4, 5, 'embl']).rename(columns={'RNA': ':START_ID'})
miRNA_pseudogene.head(n=3)

In [None]:
# Combine the three RNA -> gene edge tables and keep one row per (start, end) pair,
# gathering the provenance columns into sets.
regulates_gene_parts = [miRNA_pseudogene, miRNA_gene, piRNA_gene_fixed]
provenance_as_sets = {'Source': set, 'Method': set, 'PubMedID': set}
RNA_regulates_activity_of_Gene = (
    pd.concat(regulates_gene_parts)
    .groupby([':START_ID', ':END_ID'])
    .agg(provenance_as_sets)
    .reset_index()
)
RNA_regulates_activity_of_Gene[':TYPE'] = 'regulates_activity_of'
RNA_regulates_activity_of_Gene.to_pickle(unprocessed_edge_data_location+'RNA_regulates_activity_of_gene.pkl')
RNA_regulates_activity_of_Gene.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011002 (regulates activity of) - RNA

* [EpimiR](http://www.jianglab.cn/EpimiR/index.jsp) <br />
The EpimiR database collects 1974 regulatory relationships between 19 types of epigenetic modifications (including DNA methylation, histone acetylation, H3K4me3 and H3K27me3, etc.) and 617 miRNAs across 7 species (including Homo sapiens), curated from nearly 2000 publications.

In [None]:
# Download the miRNet miRNA-epigenetic-modifier table (Dropbox-hosted CSV) into the unprocessed-data folder.
!wget https://www.dropbox.com/s/p852ndpck5jasxz/miRNet-mir-epi-hsa.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-mir-epi-hsa.csv

In [None]:
miRNA_epiMod = pd.read_csv(unprocessed_data_location + 'miRNet-mir-epi-hsa.csv') # Epigenetic modification (SO)
# Only rows with expression == 'high' are used in this section.
miRNA_epiMod = miRNA_epiMod[miRNA_epiMod['expression'] == 'high']
miRNA_epiMod = miRNA_epiMod.drop(columns=['mirnet','mir_id','note','res_type','year','support','detect','expression'])
# Several modifications can be listed as "a/b": one row per modification.
miRNA_epiMod['epi_modification'] = miRNA_epiMod.epi_modification.str.split('/')
miRNA_epiMod = miRNA_epiMod.explode('epi_modification')

print(all(miRNA_epiMod['mir_acc'].isin(rnacentral_map_human['DB ID'])))
# These are miRBase dead entries
print(miRNA_epiMod[~miRNA_epiMod['mir_acc'].isin(rnacentral_map_human['DB ID'])])
miRNA_epiMod = pd.merge(miRNA_epiMod, rnacentral_map_human.rename(columns={'DB ID':'mir_acc'}), on='mir_acc').drop(
    columns=['DB','Organism','RNA category','DB Description','mir_acc']).rename(columns={'RNAcentral ID':':END_ID'})
# Harmonise the free-text separators to ", " so the columns can be split below.
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace("in vivo ", 'in vivo')
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace("/vivo", ', in vivo')
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace("/", ', ')
# regex=False: '.' must be the literal dot, never the regex wildcard.
miRNA_epiMod.experiment = miRNA_epiMod.experiment.str.replace(".", ', ', regex=False)
miRNA_epiMod.epi_regulator = miRNA_epiMod.epi_regulator.str.replace("/", ', ')
miRNA_epiMod.epi_regulator = miRNA_epiMod.epi_regulator.str.lower()
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.lower()
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.replace("/", ', ')
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.strip()
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.str.replace(" cluster", '')
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.replace("\\\\\\\\n", np.nan)
# Remove parenthesised remarks. The variant with a leading space has to run first:
# once the bare "(...)" pattern has run there is no parenthesis left for it to match,
# and the space in front of the remark would be left behind.
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.replace(r' \(.*?\)', '', regex=True)
miRNA_epiMod.epi_target = miRNA_epiMod.epi_target.replace(r'\(.*?\)', '', regex=True)
miRNA_epiMod.condition = miRNA_epiMod.condition.str.replace('non-small-cell lung cancer', 'non-small cell lung carcinoma')

# PMIDs as clean integer strings, missing values as NaN. The nullable 'Int64' dtype
# avoids the float ".0" suffix (no regex-prone clean-up needed) and renders missing
# values as "<NA>", which is then turned back into NaN.
pmid_numeric = pd.to_numeric(miRNA_epiMod['pmid'], errors='coerce')
miRNA_epiMod['pmid'] = pmid_numeric.astype('Int64').astype(str).replace("<NA>", np.nan)

miRNA_epiMod['epi_regulator'] = miRNA_epiMod['epi_regulator'].str.split(", ")
miRNA_epiMod = miRNA_epiMod.explode('epi_regulator')

miRNA_epiMod['epi_target'] = miRNA_epiMod['epi_target'].str.split(", ")
miRNA_epiMod = miRNA_epiMod.explode('epi_target')

miRNA_epiMod['experiment'] = miRNA_epiMod['experiment'].str.split(", ")
miRNA_epiMod = miRNA_epiMod.explode('experiment')
# Map method names through method_map ('0_y' -> '0_x'); unmapped names are kept as-is.
miRNA_epiMod = pd.merge(miRNA_epiMod, method_map, right_on='0_y', left_on='experiment', how='left')
miRNA_epiMod['0_x'] = miRNA_epiMod['0_x'].fillna(miRNA_epiMod['experiment'])
miRNA_epiMod = miRNA_epiMod.drop(columns=['0_y', 'experiment'])
miRNA_epiMod = miRNA_epiMod.rename(columns={'0_x':'Method','pmid':'PubMedID','epi_regulator':'Regulator', 'epi_target':'Interactor'})

# Same scheme for the condition, mapped through disease_map and stored as 'Location'.
miRNA_epiMod = pd.merge(miRNA_epiMod, disease_map, right_on='0_y', left_on='condition', how='left')
miRNA_epiMod['0_x'] = miRNA_epiMod['0_x'].fillna(miRNA_epiMod['condition'])
miRNA_epiMod = miRNA_epiMod.drop(columns=['0_y', 'condition'])
miRNA_epiMod = miRNA_epiMod.rename(columns={'0_x':'Location'})

#miRNA_epiMod = miRNA_epiMod.groupby([':END_ID', 'epi_modification','epi_regulator','experiment','epi_target','condition']).agg({'pmid': set}).reset_index()
#miRNA_epiMod['Number_of_experiments'] = miRNA_epiMod['pmid'].apply(len)
# Every row is recorded twice, once per provenance label ('EpimiR' and 'miRNet').
miRNA_epiMod['Source'] = 'EpimiR'
miRNA_epiMod2 = miRNA_epiMod.copy()
miRNA_epiMod['Source'] = 'miRNet'
miRNA_epiMod = pd.concat([miRNA_epiMod, miRNA_epiMod2])
miRNA_epiMod.head(n=3)

In [None]:
# Substring rewrites applied, in this order, to the lower-cased modification names
# so that they can be joined on the labels in column 0 of desc_so_map.
epi_term_rewrites = {
    'dna methylation': 'silenced by dna methylation',
    'h3k4me': 'h3k4 methylation site',
    'h5ac': 'histone acetylation',
    'h3k9me': 'h3k9 methylation site',
    'h3k27me': 'h3k27 methylation site',
    'h3s10p': 'phosphorylation site',
    'h3r17me2': 'histone methylation site',
}
epi_labels = miRNA_epiMod['epi_modification']
for raw_term, mapped_label in epi_term_rewrites.items():
    epi_labels = epi_labels.str.lower().str.replace(raw_term, mapped_label)
miRNA_epiMod['epi_modification'] = epi_labels

# Replace the label by the identifier in column 1 of desc_so_map (edge source).
miRNA_epiMod = miRNA_epiMod.merge(desc_so_map, left_on='epi_modification', right_on=0)
miRNA_epiMod = miRNA_epiMod.drop(columns=['epi_modification', 0]).rename(columns={1: ':START_ID'})

# One row per (start, end) pair with all annotations collected into sets.
annotations_as_sets = {'Source': set, 'Method': set, 'PubMedID': set,
                       'Location': set, 'Regulator': set, 'Interactor': set}
miRNA_epiMod = miRNA_epiMod.groupby([':START_ID', ':END_ID']).agg(annotations_as_sets).reset_index()
miRNA_epiMod[':TYPE'] = 'regulates_activity_of'
miRNA_epiMod.to_pickle(unprocessed_edge_data_location+'OBO_regulates_activity_of_RNA.pkl')
miRNA_epiMod.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002245 (over-expressed in) - OBO

* [piRBase](http://bigdata.ibp.ac.cn/piRBase/) <br /> piRBase is a database supporting piRNA functional study.

In [None]:
!mkdir ../resources/processed_data/unprocessed_data/piRBase_disease
# http://bigdata.ibp.ac.cn/piRBase/cancer.php --> for each disease (17) in the scroll bar -->
# download html and place in processed_data/unprocessed_data/piRBase_disease/

In [None]:
directory = unprocessed_data_location + "piRBase_disease/"
# Every file in the folder is parsed with read_html and its second table (index 1) is kept.
# The tables are collected in a list and concatenated once instead of growing `df` in the loop.
disease_tables = [pd.read_html(os.path.join(directory, filename))[1] for filename in os.listdir(directory)]
df = pd.concat(disease_tables) if disease_tables else pd.DataFrame()

df.Disease = df.Disease.str.lower()
df.Subtype = df.Subtype.str.lower()
df.Name = df.Name.str.strip()
df['Source'] = 'piRBase'
df = df[['Name', 'Disease', 'Subtype', 'Expression', 'PubMed', 'Source']] # Mondo+HPO
df = df[~df['Name'].isna()]
# .copy() so the column assignments below do not write into a filtered slice.
df = df[df['Name'].str.startswith('piR-hsa')].copy()
# Use the subtype when present, otherwise fall back to the disease name.
df['Subtype'] = df['Subtype'].fillna(df['Disease'])
df = df.drop(columns=['Disease'])
#df.PubMed = 'https://pubmed.ncbi.nlm.nih.gov/' + df.PubMed.astype('Int64').astype('str')
df = df.rename(columns = {'Subtype':'Disease'})

df = pd.merge(df, rnacentral_map_human_pirbase.rename(columns={'piRBase ID':'Name'}), on='Name').drop(
    columns=['Name']).rename(columns={'RNAcentral ID':'RNA'})

# Exact-match translation of the disease names into ontology identifiers.
disease_to_ontology_id = {
    'heart failure': 'MONDO_0005252',
    'cardiac hypertrophy': 'HP_0001714',
    'multiple myeloma': 'HP_0006775',
    'gastric cancer': 'MONDO_0001056',
    'breast cancer': 'MONDO_0007254',
    'liver cancer': 'MONDO_0002691',
    'myeloma': 'MONDO_0005170',
    'bladder cancer': 'MONDO_0004986',
    'colorectal cancer': 'MONDO_0005575',
    'pancreas cancer': 'MONDO_0005192',
    'kidney cancer': 'MONDO_0002367',
    'cardiovascular diseases': 'MONDO_0004995',
    'alzheimer': 'MONDO_0004975',
    'thyroid cancer': 'MONDO_0002108',
    'lung cancer': 'MONDO_0008903',
    'prostate cancer': 'MONDO_0008315',
    'parkinson': 'MONDO_0005180',
    'glioblastoma': 'MONDO_0018177',
    'ovarian cancer': 'MONDO_0008170',
}
df.Disease = df.Disease.replace(disease_to_ontology_id)

print(df.Expression.unique())
# Split e.g. "up-regulated FC 2.5" into the direction and the (optional) fold change.
df[['Regulation', 'Fold_Change']] = df['Expression'].str.extract(r'(up-regulated|down-regulated)\s*FC\s*(\d*\.?\d+)?')
df['Fold_Change'] = pd.to_numeric(df['Fold_Change'], errors='coerce')
df = df.drop(columns=['Expression'])

# PMIDs as clean integer strings, missing values as NaN. The nullable 'Int64' dtype
# avoids the float ".0" suffix (no regex-prone clean-up needed) and renders missing
# values as "<NA>", which is then turned back into NaN.
pubmed_numeric = pd.to_numeric(df['PubMed'], errors='coerce')
df['PubMed'] = pubmed_numeric.astype('Int64').astype(str).replace("<NA>", np.nan)

df.head(n=3)

In [None]:
# Up-regulated piRBase records as edges. rename() returns a new frame, so the
# filtered slice of `df` is never modified in place (avoids SettingWithCopyWarning).
df_up = df[df['Regulation'] == 'up-regulated'].rename(
    columns={'RNA':':START_ID','Disease':':END_ID', 'PubMed':'PubMedID'})
df_up.head(n=3)

* [iPiDA-GCN](http://bliulab.net/iPiDA-GCN/) <br /> iPiDA-GCN is a computational method for piRNA-disease association identification based on graph convolutional networks (GCNs).

In [None]:
url = "http://bliulab.net/iPiDA-GCN/static/download/piRNA_disease_information.zip" # Mondo+HPO
filename = "piRNA_disease_information.zip"

response = requests.get(url, timeout=120)
# Fail here on an HTTP error instead of saving an error page that ZipFile cannot open.
response.raise_for_status()
with open(filename, "wb") as file:
    file.write(response.content)

# Unpack the archive into the unprocessed-data folder.
with zipfile.ZipFile(filename, "r") as zip_ref:
    zip_ref.extractall(unprocessed_data_location)

In [None]:
# Load the array stored in adjPD.npy and wrap it in a DataFrame.
adjacency_path = unprocessed_data_location + "piRNA_disease_information/adjPD.npy"
adjacency_values = np.load(adjacency_path)
pirna_disease2 = pd.DataFrame(adjacency_values)
pirna_disease2.head(n=3)

In [None]:
# Disease sheet: rewrite the DOID values from "X:123" to "X_123" form and remove the row labelled 11.
disease_info_path = unprocessed_data_location + "piRNA_disease_information/disease_information.xlsx"
disease_sheet = pd.read_excel(disease_info_path)
disease_info = disease_sheet.assign(DOID=disease_sheet['DOID'].str.replace(':', '_')).drop(index=11)
disease_info.head(n=3)

In [None]:
# Look up column 1 of doid_mondo_map for every DOID and use the result as column labels.
# NOTE(review): this requires exactly one label per column of pirna_disease2 -- confirm.
doid_lookup = doid_mondo_map.rename(columns={0: 'DOID'})
colnames = disease_info.merge(doid_lookup, on='DOID')[1]
pirna_disease2.columns = colnames
pirna_disease2.head(n=3)

In [None]:
# piRNA sheet: only the symbol and the 'pi_num' column are needed.
pirna_info_path = unprocessed_data_location + "piRNA_disease_information/piRNA_information.xlsx"
pirna_sheet = pd.read_excel(pirna_info_path)
pirna_info = pirna_sheet[["ncRNA Symbol", "pi_num"]]
pirna_info.head(n=3)

In [None]:
# Attach the piRNA symbols by matching the row index to 'pi_num', then index the rows by symbol.
pirna_disease2 = pirna_disease2.merge(pirna_info, left_index=True, right_on="pi_num")
pirna_disease2 = pirna_disease2.set_index("ncRNA Symbol")
pirna_disease2.head(n=3)

In [None]:
# 'pi_num' is the helper column brought in by the merge with pirna_info, not a disease:
# leaving it in would create bogus rows with dis == 'pi_num' for the piRNA whose pi_num is 1.
adjacency = pirna_disease2.drop(columns=['pi_num'], errors='ignore')

# For every column, list the row labels (piRNA symbols) whose cell equals 1.
# Columns are addressed by position so that repeated labels cannot break the comparison,
# and a dedicated list is used so the piRBase frame `df` built earlier is not overwritten.
association_frames = []
for column_pos, disease in enumerate(adjacency.columns):
    is_associated = (adjacency.iloc[:, column_pos] == 1.0).to_numpy()
    association_frames.append(pd.DataFrame({'piRNA': adjacency.index[is_associated], 'dis': disease}))

if association_frames:
    pirna_disease_list = pd.concat(association_frames, ignore_index=True)
else:
    pirna_disease_list = pd.DataFrame(columns=["piRNA", "dis"])

pirna_disease_list.head(n=3)

In [None]:
# Replace the piRBase identifier by its RNAcentral identifier and name the edge columns.
pirbase_to_rnacentral = rnacentral_map_human_pirbase.rename(columns={'piRBase ID': 'piRNA'})
pirna_disease_list = pirna_disease_list.merge(pirbase_to_rnacentral, on='piRNA')
pirna_disease_list = pirna_disease_list.drop(columns=['piRNA']).rename(
    columns={'RNAcentral ID': ':START_ID', 'dis': ':END_ID'})

pirna_disease_list['Source'] = 'iPiDA-GCN'
pirna_disease_list.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv') # PW
# .copy() so the column assignments below do not write into a filtered slice of LncRNAWiki.
lncRNA_pw = LncRNAWiki[LncRNAWiki['pathway'].notna()].copy()
lncRNA_pw['pathway'] = lncRNA_pw['pathway'].str.lower()
lncRNA_pw = lncRNA_pw.drop(columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','clinical_detail','epigenetic_modification',
                                        'modification_detail','target_interaction', 'conservation_species','target_effect','biological_process',
                                        'description','conservation','target_type','biological_context','regulator_effect',
                                        'regulator_interaction', 'genome_variation', 'variation_detail', 'molecular_function',
                                        'expression','regulator_type','functional_mechanism'])

# One row per transcript id and per pathway name.
lncRNA_pw['transcript_id'] = lncRNA_pw['transcript_id'].str.split(',')
lncRNA_pw = lncRNA_pw.explode('transcript_id')
lncRNA_pw = lncRNA_pw[lncRNA_pw['transcript_id'].notna()]

lncRNA_pw['pathway'] = lncRNA_pw['pathway'].str.split(',')
lncRNA_pw = lncRNA_pw.explode('pathway')
lncRNA_pw = lncRNA_pw[lncRNA_pw['pathway'].notna()]

# Pathway name -> identifier (column 1 of desc_pw_map); names without a match are dropped.
lncRNA_pw = pd.merge(desc_pw_map.rename(columns={0:'pathway'}), lncRNA_pw, on=['pathway']).drop(
    columns=['pathway']).rename(columns={1:'Pathway'})

# LncBook transcript id -> RNAcentral id.
lncRNA_pw = pd.merge(lncRNA_pw, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})

lncRNA_pw = lncRNA_pw[lncRNA_pw['RNA'].notna()]

# PMIDs as clean integer strings, missing values as NaN. The nullable 'Int64' dtype
# avoids the float ".0" suffix (no regex-prone clean-up needed) and renders missing
# values as "<NA>", which is then turned back into NaN.
pmid_numeric = pd.to_numeric(lncRNA_pw['pmid'], errors='coerce')
lncRNA_pw['pmid'] = pmid_numeric.astype('Int64').astype(str).replace("<NA>", np.nan)

# The remaining annotation columns are ";"-separated lists: one row per item.
lncRNA_pw['drug'] = lncRNA_pw['drug'].str.lower().str.split(";")
lncRNA_pw = lncRNA_pw.explode('drug')

lncRNA_pw['regulator'] = lncRNA_pw['regulator'].str.lower().str.split(";")
lncRNA_pw = lncRNA_pw.explode('regulator')
lncRNA_pw['target'] = lncRNA_pw['target'].str.lower().str.split(";")
lncRNA_pw = lncRNA_pw.explode('target')

# Method / location / context names are mapped through the corresponding lookup table
# ('0_y' -> '0_x'); names without a mapping are kept as-is.
lncRNA_pw['experimental_method'] = lncRNA_pw['experimental_method'].str.lower().str.split(";")
lncRNA_pw = lncRNA_pw.explode('experimental_method')
lncRNA_pw = pd.merge(lncRNA_pw, method_map, right_on='0_y', left_on='experimental_method', how='left')
lncRNA_pw['0_x'] = lncRNA_pw['0_x'].fillna(lncRNA_pw['experimental_method'])
lncRNA_pw = lncRNA_pw.drop(columns=['0_y', 'experimental_method'])
lncRNA_pw = lncRNA_pw.rename(columns={'0_x':'Method','pmid':'PubMedID', 'drug':'Drug', 'regulator':'Regulator', 'target':'Interactor'})

lncRNA_pw['tissue/cell line'] = lncRNA_pw['tissue/cell line'].str.lower().str.split(";")
lncRNA_pw = lncRNA_pw.explode('tissue/cell line')
lncRNA_pw = pd.merge(lncRNA_pw, location_map, right_on='0_y', left_on='tissue/cell line', how='left')
lncRNA_pw['0_x'] = lncRNA_pw['0_x'].fillna(lncRNA_pw['tissue/cell line'])
lncRNA_pw = lncRNA_pw.drop(columns=['0_y', 'tissue/cell line'])
lncRNA_pw = lncRNA_pw.rename(columns={'0_x':'Location'})

lncRNA_pw['context_detail'] = lncRNA_pw['context_detail'].str.lower().str.split(";")
lncRNA_pw = lncRNA_pw.explode('context_detail')
lncRNA_pw = pd.merge(lncRNA_pw, disease_map, right_on='0_y', left_on='context_detail', how='left')
lncRNA_pw['0_x'] = lncRNA_pw['0_x'].fillna(lncRNA_pw['context_detail'])
lncRNA_pw = lncRNA_pw.drop(columns=['0_y', 'context_detail'])
lncRNA_pw = lncRNA_pw.rename(columns={'0_x':'Location2'})

# Stack the tissue/cell-line values and the context values into a single 'Location' column
# (every row therefore appears twice, once per kind of location).
lncRNA_pw = pd.concat([lncRNA_pw.drop(columns=['Location2']), lncRNA_pw.drop(columns=['Location']).rename(columns={'Location2':'Location'})])

lncRNA_pw['Source'] = 'LncRNAWiki'
lncRNA_pw.rename(columns={'RNA':':START_ID','Pathway':':END_ID'}, inplace=True)
lncRNA_pw.head(n=3)

In [None]:
# Rows whose expression_detail is exactly 'Up-regulated'; the flag column is no longer needed afterwards.
is_up_regulated = lncRNA_pw['expression_detail'] == 'Up-regulated'
lncRNA_pw_up = lncRNA_pw.loc[is_up_regulated].drop(columns=['expression_detail'])
lncRNA_pw_up.head(n=3)

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# Download the gzipped LncBook 2.0 expression table into the unprocessed-data folder.
! wget https://ngdc.cncb.ac.cn/lncbook/files/expression_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/expression_LncBook2.0.csv.gz

In [None]:
# LncBook context column -> ontology identifier used as column label.
# Biological context (Uberon+GO+CLO+Mondo+GO)
lncbook_context_to_ontology = {
    'Normal Tissue/Cell Line': 'UBERON_0000479',
    'Organ Development': 'GO_0048513',
    'Preimplantation Embryo': 'GO_0007566',
    'Cell Differentiation': 'GO_0030154',
    'Subcellular Localization': 'GO_0051179',
    'Exosome': 'GO_0070062',
    'Cancer Cell Line': 'CLO_0009828',
    'Virus Infection': 'MONDO_0005108',
    'Circadian Rhythm': 'GO_0007623',
}
lncRNA_expression = (
    pd.read_csv(unprocessed_data_location + 'expression_LncBook2.0.csv.gz')
    .drop(columns=['Symbol', 'Featured Expression'])
    .rename(columns=lncbook_context_to_ontology)
)

# LncBook gene id -> RNAcentral id (one row per distinct pair).
lncbook_gene_to_rnacentral = rnacentral_map_human_lncbook[['RNAcentral ID', 'LncBook Gene ID']].drop_duplicates().rename(
    columns={'LncBook Gene ID': 'Gene ID'})
lncRNA_expression = lncRNA_expression.merge(lncbook_gene_to_rnacentral, on='Gene ID')
lncRNA_expression = lncRNA_expression.drop(columns=['Gene ID']).rename(columns={'RNAcentral ID': 'RNA'})
lncRNA_expression.head(n=3)

In [None]:
# HC
# Ontology-labelled context columns of lncRNA_expression that are scanned for each level.
EXPRESSION_CONTEXT_COLUMNS = ['UBERON_0000479', 'GO_0048513', 'GO_0007566', 'GO_0030154', 'GO_0051179',
                              'GO_0070062', 'CLO_0009828', 'MONDO_0005108', 'GO_0007623']


def _lncbook_edges_for_level(expression, level, contexts=EXPRESSION_CONTEXT_COLUMNS):
    """Build the RNA -> context edge table for one expression-level label.

    Parameters
    ----------
    expression : pd.DataFrame
        Frame with an 'RNA' column and one column per entry of `contexts`.
    level : str
        Label to look for in the context columns ('HC', 'NE', 'MC' or 'LC').
    contexts : list of str
        Context columns to scan; the column label itself becomes ':END_ID'.

    Returns
    -------
    pd.DataFrame
        Columns ':START_ID' (the RNA), ':END_ID' (the context label) and 'Source'
        ('LncBook'), one row per cell of `expression` equal to `level`. Rows keep
        the index of `expression` and are ordered context by context.
    """
    frames = [
        pd.DataFrame({':START_ID': expression.loc[expression[context] == level, 'RNA'],
                      ':END_ID': context})
        for context in contexts
    ]
    # Single concat of per-context frames instead of growing a frame inside the loop.
    edges = pd.concat(frames)
    edges['Source'] = 'LncBook'
    return edges


HCfinal = _lncbook_edges_for_level(lncRNA_expression, 'HC')
NEfinal = _lncbook_edges_for_level(lncRNA_expression, 'NE')
MCfinal = _lncbook_edges_for_level(lncRNA_expression, 'MC')
LCfinal = _lncbook_edges_for_level(lncRNA_expression, 'LC')

HCfinal.head(n=3)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet miRNA-disease table (Dropbox-hosted CSV) into the unprocessed-data folder.
!wget https://www.dropbox.com/s/o27wz2kg9co76mo/miRNet-mir-disease.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-mir-disease.csv

In [None]:
mirnet = pd.read_csv(unprocessed_data_location + "miRNet-mir-disease.csv").drop(columns=['mirnet','mir_id']) # Mondo+HPO
mirnet.disease = mirnet.disease.str.lower()
# '|'-separated PMIDs and ', '-separated evidence items: one row per value.
mirnet.pmid = mirnet.pmid.str.split('|')
mirnet = mirnet.explode('pmid')
mirnet.evidence = mirnet.evidence.str.split(', ')
mirnet = mirnet.explode('evidence')
# Strip the HMDD version tags (literal match, so '.' is not treated as a wildcard),
# then keep the distinct database names of each row as a set.
mirnet.database = mirnet.database.str.replace(" v3.2", "", regex=False).str.replace(" v2.0", "", regex=False)
mirnet.database = mirnet.database.str.split(', ')
mirnet['database'] = mirnet['database'].apply(lambda x: set(x))
mirnet.method = mirnet.method.str.replace(", HMDD v3.2", "", regex=False).str.replace(
    ", HMDD v2.0", "", regex=False).str.replace("HMDD v3.2", "", regex=False).str.replace("HMDD v2.0", "", regex=False)

mirnet.method = mirnet.method.str.lower()
mirnet.method = mirnet.method.str.split(', ')
mirnet = mirnet.explode('method')
# Map method names through method_map ('0_y' -> '0_x'); unmapped names are kept as-is.
mirnet = pd.merge(mirnet, method_map, right_on='0_y', left_on='method', how='left')
mirnet['0_x'] = mirnet['0_x'].fillna(mirnet['method'])
mirnet = mirnet.drop(columns=['0_y', 'method'])
mirnet = mirnet.rename(columns={'0_x':'Method'})

print(all(mirnet['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
# These are all miRBase dead entries
print(mirnet[~mirnet['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])].head(n=3))
mirnet = pd.merge(mirnet, rnacentral_map_human_mirbase.rename(columns={'miRBase ID':'mir_acc'}), on='mir_acc').drop(
    columns=['mir_acc']).rename(columns={'RNAcentral ID':'RNA'})

# miRBase and HMDD entries are not updated in miRNet. We will use the latest version of the databases
mirnet['database'] = mirnet['database'].astype(str)
mirnet = mirnet[(mirnet['database']!="{'HMDD'}")]
mirnet = mirnet[(mirnet['database']!="{'miR2Disease'}")]
# Turn the set repr ("{'A', 'B'}") back into "A, B"; raw strings avoid invalid escape sequences.
mirnet['database'] = mirnet['database'].str.replace(r'\{', '', regex=True)
mirnet['database'] = mirnet['database'].str.replace(r'\}', '', regex=True)
mirnet['database'] = mirnet['database'].str.replace("'", '', regex=True)
mirnet['database'] = mirnet['database'].astype(str) +', miRNet'
mirnet['database'] = mirnet['database'].str.split(', ')
mirnet = mirnet.explode('database')

# Disease name -> identifier lookup built from the miR2Disease disease list, resolved
# both through doid_mondo_map (by DOID) and through desc_disPhe_map (by name).
desc_to_doid_mi2disease = pd.read_csv("http://watson.compbio.iupui.edu:8080/miR2Disease/download/diseaseList.txt",
                                      sep="\t", names=[1,0], skiprows=1)
desc_to_doid_mi2disease[0] = desc_to_doid_mi2disease[0].str.replace(':', '_')
desc_to_doid_mi2disease[1] = desc_to_doid_mi2disease[1].str.replace(r'\(.*?\)', '', regex=True).str.strip().str.lower()
desc_to_mondo_mir2disease = pd.merge(desc_to_doid_mi2disease, doid_mondo_map, on=0)[['1_x','1_y']].rename(columns={'1_x':0,'1_y':1})
desc_to_mondo_mir2disease2 = pd.merge(desc_to_doid_mi2disease, desc_disPhe_map.rename(
    columns={0:1,1:0}), on=1)[[1,'0_y']].rename(columns={1:0,'0_y':1})
desc_to_mondo_mir2disease = pd.concat([desc_to_mondo_mir2disease, desc_to_mondo_mir2disease2]).drop_duplicates()
mirnet = mirnet.merge(desc_to_mondo_mir2disease, left_on='disease', right_on=0).drop(
    columns=[0,'disease']).rename(columns={1:'Disease'})
#mirnet = mirnet.groupby(['RNA', 'Disease', 'method', 'evidence', 'database']).agg({'pmid': set}).reset_index()#
#mirnet['Number_of_experiments'] = mirnet['pmid'].apply(len)

# PMIDs as clean integer strings, missing values as NaN. The nullable 'Int64' dtype
# avoids the float ".0" suffix (no regex-prone clean-up needed) and renders missing
# values as "<NA>", which is then turned back into NaN.
pmid_numeric = pd.to_numeric(mirnet['pmid'], errors='coerce')
mirnet['pmid'] = pmid_numeric.astype('Int64').astype(str).replace("<NA>", np.nan)

mirnet.rename(columns={'RNA':':START_ID','Disease':':END_ID', 'pmid':'PubMedID', 'database':'Source'}, inplace=True)
mirnet.head(n=3)

In [None]:
# Rows whose evidence text mentions over-expression. na=False treats a missing evidence
# value as "no match"; without it a NaN in the mask makes the boolean indexing raise.
mirnet_up = mirnet[mirnet['evidence'].str.contains('overexpr', na=False)].drop(columns=['evidence'])
mirnet_up.head(n=3)

* [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/) <br />miR2Disease is a manually curated database that aims at providing a comprehensive resource of miRNA deregulation in various human diseases.

In [None]:
# Fetch the miR2Disease "AllEntries" table into the unprocessed-data folder
# (data_downloader is presumably the helper from pkt_kg.utils -- verify).
data_downloader('http://watson.compbio.iupui.edu:8080/miR2Disease/download/AllEntries.txt', unprocessed_data_location) # Mondo+HPO

In [None]:
mir2disease = pd.read_csv(unprocessed_data_location + 'AllEntries.txt', sep="\t", header=None).drop(columns=[4,5])
mir2disease[1] = mir2disease[1].str.lower()
mir2disease = mir2disease.rename(columns={0: 'mir_id', 1: 'disease'})

# Disease name -> identifier lookup built from the miR2Disease disease list, resolved
# both through doid_mondo_map (by DOID) and through desc_disPhe_map (by name).
desc_to_doid_mi2disease = pd.read_csv("http://watson.compbio.iupui.edu:8080/miR2Disease/download/diseaseList.txt",
                                      sep="\t", names=[1,0], skiprows=1)
desc_to_doid_mi2disease[0] = desc_to_doid_mi2disease[0].str.replace(':', '_')
desc_to_doid_mi2disease[1] = desc_to_doid_mi2disease[1].str.replace(r'\(.*?\)', '', regex=True).str.strip().str.lower()
desc_to_mondo_mir2disease = pd.merge(desc_to_doid_mi2disease, doid_mondo_map, on=0)[['1_x','1_y']].rename(columns={'1_x':0,'1_y':1})
desc_to_mondo_mir2disease2 = pd.merge(desc_to_doid_mi2disease, desc_disPhe_map.rename(
    columns={0:1,1:0}), on=1)[[1,'0_y']].rename(columns={1:0,'0_y':1})
desc_to_mondo_mir2disease = pd.concat([desc_to_mondo_mir2disease, desc_to_mondo_mir2disease2]).drop_duplicates()
mir2disease = mir2disease.merge(desc_to_mondo_mir2disease, left_on='disease', right_on=0).drop(
    columns=[0,'disease']).rename(columns={1:'Disease'})

print(all(mir2disease['mir_id'].isin(rnacentral_map_human['DB Description'])))
mir2disease['mir_id'] = mir2disease['mir_id'].str.strip()
print(mir2disease[~mir2disease['mir_id'].isin(rnacentral_map_human['DB Description'])]['mir_id'].str[:3].unique())

# Names with no exact RNAcentral match are retried with '-5p' / '-3p' appended.
# `assign` builds new frames, so nothing is written into a filtered slice
# (avoids SettingWithCopyWarning), and each variable name matches the suffix it holds.
unmapped_mir = mir2disease[~mir2disease['mir_id'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = unmapped_mir.assign(mir_id=unmapped_mir['mir_id'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = unmapped_mir.assign(mir_id=unmapped_mir['mir_id'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'mir_id'}), on='mir_id').drop(columns=['mir_id']).rename(columns={'RNAcentral ID':':START_ID'})
mir2disease = pd.merge(mir2disease, rnacentral_map_human.rename(columns={'DB Description':'mir_id'}), on='mir_id').drop(
    columns=['mir_id']).rename(columns={'RNAcentral ID':':START_ID'})
mir2disease = pd.concat([mir2disease, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])
#mir2disease = mir2disease.fillna('nan')
#mir2disease['Number_of_experiments'] = (mir2disease.groupby(mir2disease.columns.tolist()).transform('size'))
#mir2disease = mir2disease[mir2disease['Number_of_experiments'].notna()].drop_duplicates()

# Column 3 is a ', '-separated list that is mapped through method_map ('0_y' -> '0_x');
# values without a mapping are kept as-is.
mir2disease[3] = mir2disease[3].str.lower()
mir2disease[3] = mir2disease[3].str.split(', ')
mir2disease = mir2disease.explode(3)
mir2disease = pd.merge(mir2disease, method_map, right_on='0_y', left_on=3, how='left')
mir2disease['0_x'] = mir2disease['0_x'].fillna(mir2disease[3])
mir2disease = mir2disease.drop(columns=['0_y', 3])
mir2disease = mir2disease.rename(columns={'0_x':'Method'})

mir2disease['Source'] = 'miR2Disease'
mir2disease = mir2disease.rename(columns={'Disease':':END_ID'})
mir2disease.head(n=3)

In [None]:
# Distinct values found in column 2 of the miR2Disease table (used below to split by expression pattern).
pd.unique(mir2disease[2])

In [None]:
# miR2Disease associations whose column 2 is exactly 'up-regulated'; the column is dropped afterwards.
mir2disease_up = mir2disease.loc[mir2disease[2].eq('up-regulated')].drop(columns=[2])
mir2disease_up.head(n=3)

* [HMDD](https://www.cuilab.cn/hmdd) <br /> HMDD (the Human microRNA Disease Database) is a database that curates experimentally supported evidence for human microRNA (miRNA) and disease associations. miRNAs are an important class of regulatory RNAs that mainly repress gene expression at the post-transcriptional level.

In [None]:
!wget http://www.cuilab.cn/static/hmdd3/data/alldata_v4.xlsx --no-check-certificate -O ../resources/processed_data/unprocessed_data/alldata_v4.xlsx

In [None]:
# Load the HMDD v4 miRNA-disease table; the free-text 'description' column is not used.
hmdd = pd.read_excel(unprocessed_data_location+'alldata_v4.xlsx').drop(columns=['description']) # Mondo+HPO
hmdd.disease = hmdd.disease.str.lower().str.strip()
hmdd = hmdd[hmdd['disease'].notna()]

# Inner join on the normalised disease name: rows whose disease is absent from
# desc_disPhe_map (column 0 = description, column 1 = mapped identifier) are dropped.
hmdd = hmdd.merge(desc_disPhe_map, left_on='disease', right_on=0).drop(
    columns=[0,'disease']).rename(columns={1:'Disease'})

# Diagnostics: are all miRNA names known to RNAcentral, and which name prefixes are not?
print(all(hmdd['miRNA'].isin(rnacentral_map_human['DB Description'])))
hmdd['miRNA'] = hmdd['miRNA'].str.strip()
hmdd['miRNA'] = hmdd['miRNA'].str.replace('EBV', 'ebv')
hmdd['miRNA'] = hmdd['miRNA'].str.replace('HBV', 'hbv')
not_in_rnacentral = ~hmdd['miRNA'].isin(rnacentral_map_human['DB Description'])
print(hmdd[not_in_rnacentral]['miRNA'].str[:3].unique())

# Names without an RNAcentral match are retried with '-5p' and '-3p' appended.
# `.copy()` gives independent frames, so the column assignments below do not act on a
# slice of `hmdd` (no SettingWithCopyWarning), and each variable now holds the suffix
# its name advertises.
miRNA_RNA_miRNAnotInRNAcentral5p = hmdd[not_in_rnacentral].copy()
miRNA_RNA_miRNAnotInRNAcentral3p = hmdd[not_in_rnacentral].copy()
miRNA_RNA_miRNAnotInRNAcentral5p['miRNA'] = miRNA_RNA_miRNAnotInRNAcentral5p['miRNA'].astype(str) + '-5p'
miRNA_RNA_miRNAnotInRNAcentral3p['miRNA'] = miRNA_RNA_miRNAnotInRNAcentral3p['miRNA'].astype(str) + '-3p'
# '-5p' rows first, then '-3p' rows (same row order as before).
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA'}), on='miRNA').drop(columns=['miRNA']).rename(columns={'RNAcentral ID':'RNA'})

# Direct matches, then the suffix-rescued matches; RNAcentral bookkeeping columns are dropped.
hmdd = pd.merge(hmdd, rnacentral_map_human.rename(columns={'DB Description':'miRNA'}), on='miRNA').drop(
    columns=['miRNA']).rename(columns={'RNAcentral ID':'RNA'})
hmdd = pd.concat([hmdd, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])
#hmdd = hmdd.fillna('nan')
#hmdd['Number_of_experiments'] = (hmdd.groupby(hmdd.columns.tolist()).transform('size'))
#hmdd = hmdd[hmdd['Number_of_experiments'].notna()].drop_duplicates()

# Normalise PMIDs to integer-like strings and keep missing/unparseable values as NaN.
# The previous str.replace(".0", "") was a regex under pandas < 2 (it also removed e.g. "30"
# inside an ID), and astype(str) turned NaN into the string "nan", never "<NA>".
pmid_numeric = pd.to_numeric(hmdd['PMID'], errors='coerce')
hmdd['PMID'] = pmid_numeric.map(lambda value: str(int(value)), na_action='ignore')

hmdd = hmdd.rename(columns={'RNA':':START_ID','Disease':':END_ID','PMID':'PubMedID'})
hmdd['Source'] = 'HMDD'
hmdd.head(n=3)

In [None]:
# First five distinct evidence codes present in HMDD.
hmdd['code'].unique()[:5]

In [None]:
# HMDD rows whose evidence code is exactly 'tissue_expression_up'; the code column is dropped afterwards.
hmdd_up = hmdd.loc[hmdd['code'].eq('tissue_expression_up')].drop(columns=['code'])
hmdd_up.head(n=3)

* [dbDEMC](https://www.biosino.org/dbDEMC/index) <br /> dbDEMC (database of Differentially Expressed MiRNAs in human Cancers) is an integrated database designed to store and display differentially expressed microRNAs (miRNAs) in cancers.

In [None]:
! wget https://www.biosino.org/dbDEMC/download/MiRExpAll -O ../resources/processed_data/unprocessed_data/MiRExpAll

In [None]:
# Load dbDEMC expression records and keep rows whose Species contains "apiens".
# `.copy()` after each boolean filter makes the following column assignments act on an
# independent frame instead of a slice (avoids SettingWithCopyWarning).
dbdemc = pd.read_csv(unprocessed_data_location+"MiRExpAll", sep="\t")
dbdemc = dbdemc[dbdemc.Species.str.contains("apiens")].copy()
# Fall back to the cancer type when no subtype is given.
dbdemc['CancerSubtype'] = dbdemc['CancerSubtype'].fillna(dbdemc['CancerType'])
dbdemc = dbdemc.drop(columns=['ExperimentID','logFC','AveExpr','miRNA_ID','SourceDataID','ExperimentalDesign',
                                              'Tvalue','Pvalue','Bvalue','Species','CancerType'])
# Keep only records with adjusted p-value below 0.01.
dbdemc = dbdemc[dbdemc['adjPvalue']<.01].copy()
dbdemc.CancerSubtype = dbdemc.CancerSubtype.str.lower().str.strip()

# Inner join on the normalised cancer name: unmapped names are dropped.
dbdemc = dbdemc.merge(desc_disPhe_map, left_on='CancerSubtype', right_on=0).drop(
    columns=[0,'CancerSubtype']).rename(columns={1:'Disease'})

# Diagnostics: are all miRNA names known to RNAcentral, and which name prefixes are not?
print(all(dbdemc['miRBaseID'].isin(rnacentral_map_human['DB Description'])))
not_in_rnacentral = ~dbdemc['miRBaseID'].isin(rnacentral_map_human['DB Description'])
print(dbdemc[not_in_rnacentral]['miRBaseID'].str[:3].unique())

# Names without an RNAcentral match are retried with '-5p' and '-3p' appended; each
# variable now holds the suffix its name advertises.
miRNA_RNA_miRNAnotInRNAcentral5p = dbdemc[not_in_rnacentral].copy()
miRNA_RNA_miRNAnotInRNAcentral3p = dbdemc[not_in_rnacentral].copy()
miRNA_RNA_miRNAnotInRNAcentral5p['miRBaseID'] = miRNA_RNA_miRNAnotInRNAcentral5p['miRBaseID'].astype(str) + '-5p'
miRNA_RNA_miRNAnotInRNAcentral3p['miRBaseID'] = miRNA_RNA_miRNAnotInRNAcentral3p['miRBaseID'].astype(str) + '-3p'
# '-5p' rows first, then '-3p' rows (same row order as before).
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miRBaseID'}), on='miRBaseID').drop(columns=['miRBaseID']).rename(columns={'RNAcentral ID':'RNA'})

# Direct matches go through a de-duplicated two-column (name, RNAcentral ID) lookup.
dbdemc_map = rnacentral_map_human[rnacentral_map_human['DB Description'].isin(dbdemc['miRBaseID']) &
                    (~rnacentral_map_human['DB Description'].isna())][['DB Description','RNAcentral ID']].drop_duplicates()
dbdemc_map = dbdemc_map.rename(columns={'DB Description':'miRBaseID'})
dbdemc = pd.merge(dbdemc, dbdemc_map, on='miRBaseID').drop(
        columns=['miRBaseID']).rename(columns={'RNAcentral ID':'RNA'})
print(dbdemc.head(n=3))
# The bookkeeping columns dropped here come from the suffix-rescued frame, which was
# merged against the full RNAcentral map.
dbdemc = pd.concat([dbdemc, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])

# One row per cell line; names found in location_map ('0_y') are replaced by its '0_x'
# value, others keep the raw lower-cased name.
dbdemc['Cellline'] = dbdemc['Cellline'].str.lower().str.split(";")
dbdemc = dbdemc.explode('Cellline')
dbdemc = pd.merge(dbdemc, location_map, right_on='0_y', left_on='Cellline', how='left')
dbdemc['0_x'] = dbdemc['0_x'].fillna(dbdemc['Cellline'])
dbdemc = dbdemc.drop(columns=['0_y', 'Cellline'])
dbdemc = dbdemc.rename(columns={'0_x':'Location'})

dbdemc = dbdemc.rename(columns={'RNA':':START_ID','Disease':':END_ID', 'adjPvalue':'FDR'})
dbdemc['Source'] = 'dbDEMC'
dbdemc.head(n=3)

In [None]:
# dbDEMC rows whose Status is exactly 'UP'; the Status column is dropped afterwards.
dbdemc_up = dbdemc.loc[dbdemc['Status'].eq('UP')].drop(columns=['Status'])
dbdemc_up.head(n=3)

* miRCancer

In [None]:
# Load the miRCancer (June 2020) table: tab-separated, latin-1 encoded.
mircancer = pd.read_csv(unprocessed_data_location + 'miRCancerJune2020.txt',sep='\t', encoding='latin1')
# The triple-quoted string below is disabled code kept for reference; it is never executed.
# It looked up each unique 'PubMed Article' value with Entrez.esearch (db="pubmed"),
# kept the first returned ID as 'PMID', processed the list in 7 chunks with a checkpoint
# file per chunk, and wrote the combined result to 'miRCancerJune2020_ref.txt'.
'''
mircancer_ref = pd.DataFrame(mircancer['PubMed Article'].unique(), columns=["PubMed Article"])

Entrez.email = 'emanuelecavalleri@email.com'
def convert_to_pmid(article_title):
    handle = Entrez.esearch(db="pubmed", term=article_title)
    record = Entrez.read(handle)
    handle.close()
    if record["IdList"]:
        return record["IdList"][0]
    else:
        return None

# NCBI works with some limits
mircancer_ref = np.array_split(mircancer_ref, 7)
mircancer_ref[0]["PMID"] = mircancer_ref[0]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[0].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref0.txt', sep='\t', index=None)
mircancer_ref[1]["PMID"] = mircancer_ref[1]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[1].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref1.txt', sep='\t', index=None)
mircancer_ref[2]["PMID"] = mircancer_ref[2]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[2].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref2.txt', sep='\t', index=None)
mircancer_ref[3]["PMID"] = mircancer_ref[3]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[3].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref3.txt', sep='\t', index=None)
mircancer_ref[4]["PMID"] = mircancer_ref[4]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[4].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref4.txt', sep='\t', index=None)
mircancer_ref[5]["PMID"] = mircancer_ref[5]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[5].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref5.txt', sep='\t', index=None)
mircancer_ref[6]["PMID"] = mircancer_ref[6]["PubMed Article"].apply(convert_to_pmid)
mircancer_ref[6].to_csv(unprocessed_data_location + 'miRCancerJune2020_ref6.txt', sep='\t', index=None)
mircancer_ref = pd.concat(mircancer_ref)
mircancer_ref.to_csv(unprocessed_data_location + 'miRCancerJune2020_ref.txt', sep='\t', index=None)
'''
# Article-title -> PMID lookup table read from disk.
mircancer_ref = pd.read_csv(unprocessed_data_location + 'miRCancerJune2020_ref.txt', sep='\t')

# Left join: rows whose article has no entry in the lookup keep a missing PMID.
mircancer = pd.merge(mircancer, mircancer_ref, on='PubMed Article', how='left')

# Inner join on the cancer name: rows whose cancer is absent from desc_disPhe_map are dropped.
mircancer = mircancer.merge(desc_disPhe_map, left_on='Cancer', right_on=0).drop(
    columns=[0,'Cancer']).rename(columns={1:'Disease'})

# Diagnostics: are all miRNA names known to RNAcentral, and which name prefixes are not?
print(all(mircancer['mirId'].isin(rnacentral_map_human['DB Description'])))
not_in_rnacentral = ~mircancer['mirId'].isin(rnacentral_map_human['DB Description'])
print(mircancer[not_in_rnacentral]['mirId'].str[:3].unique())

# Names without an RNAcentral match are retried with '-5p' / '-3p' appended, and again
# with an additional 'hsa-' prefix. `.copy()` gives independent frames, so the column
# assignments do not act on a slice of `mircancer` (no SettingWithCopyWarning), and each
# variable now holds the suffix its name advertises.
miRNA_RNA_miRNAnotInRNAcentral5p = mircancer[not_in_rnacentral].copy()
miRNA_RNA_miRNAnotInRNAcentral3p = mircancer[not_in_rnacentral].copy()
miRNA_RNA_miRNAnotInRNAcentral5p['mirId'] = miRNA_RNA_miRNAnotInRNAcentral5p['mirId'].astype(str) + '-5p'
miRNA_RNA_miRNAnotInRNAcentral3p['mirId'] = miRNA_RNA_miRNAnotInRNAcentral3p['mirId'].astype(str) + '-3p'
# '-5p' rows first, then '-3p' rows (same row order as before).
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentralhsa = miRNA_RNA_miRNAnotInRNAcentral.copy()
miRNA_RNA_miRNAnotInRNAcentralhsa['mirId'] = 'hsa-' + miRNA_RNA_miRNAnotInRNAcentralhsa['mirId'].astype(str)
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral, miRNA_RNA_miRNAnotInRNAcentralhsa])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'mirId'}), on='mirId').drop(columns=['mirId']).rename(columns={'RNAcentral ID':':START_ID'})

# Direct matches, then the rescued matches; RNAcentral bookkeeping columns are dropped.
mircancer = pd.merge(mircancer, rnacentral_map_human.rename(
    columns={'DB Description':'mirId'}), on='mirId').drop(
        columns=['DB','DB ID','mirId','Organism','RNA category']).rename(columns={'RNAcentral ID':':START_ID'})
mircancer = pd.concat([mircancer, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])
#mircancer = mircancer.fillna('nan')
#mircancer = mircancer.groupby([':START_ID', 'Disease', 'Profile']).agg({'PMID': set}).reset_index()
#mircancer['Number_of_experiments'] = mircancer['PMID'].apply(len)

# Normalise PMIDs to integer-like strings and keep missing/unparseable values as NaN.
# The previous str.replace(".0", "") was a regex under pandas < 2 (it also removed e.g. "30"
# inside an ID), and astype(str) turned NaN into the string "nan", never "<NA>".
pmid_numeric = pd.to_numeric(mircancer['PMID'], errors='coerce')
mircancer['PMID'] = pmid_numeric.map(lambda value: str(int(value)), na_action='ignore')

mircancer = mircancer.rename(columns={'Disease':':END_ID', 'PMID':'PubMedID'})
mircancer['Source'] = 'miRCancer'
mircancer.head(n=3)

In [None]:
# miRCancer rows whose Profile text contains 'up'; profile and article-title columns are dropped.
profile_mentions_up = mircancer['Profile'].str.contains('up')
mircancer_up = mircancer.loc[profile_mentions_up].drop(columns=['Profile','PubMed Article'])
mircancer_up.head(n=3)

* [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/)

In [None]:
! wget https://www.rna-society.org/ncrdeathdb/data/allNcRNACelldeathData.xlsx -O ../resources/processed_data/unprocessed_data/allNcRNACelldeathData.xlsx

In [None]:
# Load ncRDeathDB; 'geneid' is read as string.
RNA_pDeath = pd.read_excel(unprocessed_data_location + 'allNcRNACelldeathData.xlsx', dtype={"geneid": "string"})
print(RNA_pDeath['RNA Category'].unique())
print(RNA_pDeath['Action_Mode'].unique())
print(RNA_pDeath['Pathway'].unique())
# Replace the three pathway labels with GO identifiers.
RNA_pDeath['Pathway'] = RNA_pDeath['Pathway'].replace({'necrosis': 'GO_0097300', 'autophagy': 'GO_0006914',
                                                       'apoptosis': 'GO_0006915'})
# Harmonise Action_Mode spellings, then emit one row per comma-separated mode.
RNA_pDeath.Action_Mode = RNA_pDeath.Action_Mode.str.replace('updown', 'up,down')
RNA_pDeath.Action_Mode = RNA_pDeath.Action_Mode.str.replace('dowm', 'down')
RNA_pDeath.Action_Mode = RNA_pDeath.Action_Mode.str.replace('up ed', 'up')
RNA_pDeath.Action_Mode = RNA_pDeath.Action_Mode.str.split(",")
RNA_pDeath = RNA_pDeath.explode('Action_Mode')
# One row per comma-separated miRBase identifier.
RNA_pDeath.miRBase_ID = RNA_pDeath.miRBase_ID.str.split(",")
RNA_pDeath = RNA_pDeath.explode('miRBase_ID')
RNA_pDeath['RNA Category'] = RNA_pDeath['RNA Category'].str.strip()
RNA_pDeath['miRNA_symbol'] = RNA_pDeath['miRNA_symbol'].str.strip() # This (mislabeled) column is used for identifying lncRNAs and snoRNAs
RNA_pDeath['miRNA_symbol'] = RNA_pDeath['miRNA_symbol'].str.upper()
RNA_pDeath['miRBase_ID'] = RNA_pDeath['miRBase_ID'].str.strip()
RNA_pDeath['Gene_Symbol'] = RNA_pDeath['Gene_Symbol'].str.strip()
RNA_pDeath['Tissue'] = RNA_pDeath['Tissue'].str.strip()
# Keep tax_id 9606 only (TODO: add the other species). Filtering and dropping in one
# expression yields a new frame, so the assignments below do not act on a slice.
# Gene_Symbol (kept) is the gene the RNA interacts with.
RNA_pDeath = RNA_pDeath[RNA_pDeath.tax_id == 9606].drop(
    columns=['Description','Description.1','tax_id','Organism','id','miRBase_mature_ID',
             'geneid','Synonyms','Links','chromosome','map_location','type_of_gene','Full_name_from_nomenclature_authority',
             'Other_designations'])

# Normalise PMIDs to integer-like strings and keep missing/unparseable values as NaN.
# The previous str.replace(".0", "") was a regex under pandas < 2 (it also removed e.g. "30"
# inside an ID), and astype(str) turned NaN into the string "nan", never "<NA>".
pmid_numeric = pd.to_numeric(RNA_pDeath['PMID'], errors='coerce')
RNA_pDeath['PMID'] = pmid_numeric.map(lambda value: str(int(value)), na_action='ignore')

# One row per tissue; names found in location_map ('0_y') are replaced by its '0_x'
# value, others keep the raw lower-cased name.
RNA_pDeath['Tissue'] = RNA_pDeath['Tissue'].str.lower().str.split(";")
RNA_pDeath = RNA_pDeath.explode('Tissue')
RNA_pDeath = pd.merge(RNA_pDeath, location_map, right_on='0_y', left_on='Tissue', how='left')
RNA_pDeath['0_x'] = RNA_pDeath['0_x'].fillna(RNA_pDeath['Tissue'])
RNA_pDeath = RNA_pDeath.drop(columns=['0_y', 'Tissue'])
RNA_pDeath = RNA_pDeath.rename(columns={'0_x':'Location', 'PMID':'PubMedID'})

RNA_pDeath.head(n=3)

In [None]:
# miRNA records are resolved through their miRBase identifier.
miRNA_pDeath = RNA_pDeath.loc[RNA_pDeath['RNA Category'].eq('miRNA')]
# Show a few miRBase IDs that are not present in the RNAcentral/miRBase map;
# the inner join below drops those rows.
has_mirbase_match = miRNA_pDeath['miRBase_ID'].isin(rnacentral_map_human_mirbase['miRBase ID'])
print(miRNA_pDeath[~has_mirbase_match]['miRBase_ID'].unique()[:5])
miRNA_pDeath = miRNA_pDeath.merge(
    rnacentral_map_human_mirbase, left_on='miRBase_ID', right_on='miRBase ID'
).drop(columns=['miRBase_ID','miRBase ID','RNA Category','miRNA_symbol'])
miRNA_pDeath.head(n=3)

In [None]:
# Every non-miRNA record is resolved through the upper-cased symbol column, matched
# against HGNC symbols (inner join: unmatched symbols are dropped).
hgnc_lookup = rnacentral_map_human_hgnc[['RNAcentral ID','HGNC symbol']].drop_duplicates()
RNA_pDeath = RNA_pDeath.loc[RNA_pDeath['RNA Category'].ne('miRNA')]
RNA_pDeath = RNA_pDeath.merge(hgnc_lookup, left_on='miRNA_symbol', right_on='HGNC symbol')
RNA_pDeath = RNA_pDeath.drop(columns=['RNA Category','miRBase_ID','HGNC symbol','miRNA_symbol'])
RNA_pDeath.head(n=3)

In [None]:
# Stack the non-miRNA and miRNA records and switch to edge-style column names.
RNA_pDeath = (
    pd.concat([RNA_pDeath, miRNA_pDeath])
    .rename(columns={'RNAcentral ID':':START_ID','Pathway':':END_ID'})
    .assign(Source='ncRDeathDB')
)

In [None]:
# ncRDeathDB rows whose Action_Mode is exactly 'up'; the mode column is dropped afterwards.
RNA_pDeath_up = RNA_pDeath.loc[RNA_pDeath['Action_Mode'].eq('up')].drop(columns='Action_Mode') # GO
RNA_pDeath_up.head(n=3)

* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html)

In [None]:
! wget http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/circRNA.xlsx -O ../resources/processed_data/unprocessed_data/circRNA.xlsx

In [None]:
# Load the Lnc2Cancer circRNA table without the columns that are not used here.
circRNA_disease2 = pd.read_excel(unprocessed_data_location+'circRNA.xlsx').drop(columns=[
                                 'sample','function description','year','title']) # circBase -- Mondo+HPO
#circRNA_disease2.drop(columns=['desc','cancer type'],inplace=True)
circRNA_disease2['name'] = "hsa_" + circRNA_disease2['name'].str.strip().str.lower()
# Keep only names of the form hsa_circ_<digits>; `.copy()` so the assignments below act
# on an independent frame, not a slice (avoids SettingWithCopyWarning).
circRNA_disease2 = circRNA_disease2[circRNA_disease2['name'].str.match(r'hsa_circ_\d+')].copy()
circRNA_disease2['cancer type'] = circRNA_disease2['cancer type'].str.strip().str.lower()
circRNA_disease2['regulated'] = circRNA_disease2['regulated'].str.strip().str.lower()
# Inner join on the normalised cancer name: unmapped names are dropped.
circRNA_disease2 = pd.merge(desc_disPhe_map.rename(columns={0:'cancer type'}), circRNA_disease2, on=['cancer type']).drop(
    columns=['cancer type']).rename(columns={1:'Disease'})

# Harmonise the regulation vocabulary; the label sets are printed before and after.
print(circRNA_disease2.regulated.unique())
circRNA_disease2.regulated = circRNA_disease2.regulated.str.replace("down-regulation", "down-regulated")
circRNA_disease2.regulated = circRNA_disease2.regulated.str.replace("expressed", "expression")
circRNA_disease2.regulated = circRNA_disease2.regulated.str.replace("ly", "")
print(circRNA_disease2.regulated.unique())

# Normalise PMIDs to integer-like strings and keep missing/unparseable values as NaN.
# The previous str.replace(".0", "") was a regex under pandas < 2 (it also removed e.g. "30"
# inside an ID), and astype(str) turned NaN into the string "nan", never "<NA>".
pmid_numeric = pd.to_numeric(circRNA_disease2['pubmed id'], errors='coerce')
circRNA_disease2['pubmed id'] = pmid_numeric.map(lambda value: str(int(value)), na_action='ignore')

# "?" and " etc." are literal tokens, so regex is disabled explicitly.
circRNA_disease2['methods'] = circRNA_disease2['methods'].str.replace("?", ' ', regex=False)
circRNA_disease2['methods'] = circRNA_disease2['methods'].str.replace(" etc.", '', regex=False)
circRNA_disease2['methods'] = circRNA_disease2['methods'].str.lower().str.split(", ")
circRNA_disease2 = circRNA_disease2.explode('methods')
# Method names are normalised with method_map, as in the other method-normalising cells
# of this notebook; this cell previously joined against location_map by mistake.
# Names without a match keep the raw lower-cased text.
circRNA_disease2 = pd.merge(circRNA_disease2, method_map, right_on='0_y', left_on='methods', how='left')
circRNA_disease2['0_x'] = circRNA_disease2['0_x'].fillna(circRNA_disease2['methods'])
circRNA_disease2 = circRNA_disease2.drop(columns=['0_y', 'methods'])
circRNA_disease2 = circRNA_disease2.rename(columns={'0_x':'Method'})

circRNA_disease2['Source'] = 'Lnc2Cancer'
circRNA_disease2 = circRNA_disease2.rename(columns={'name':':START_ID', 'Disease':':END_ID', 'pubmed id':'PubMedID'})
circRNA_disease2.head(n=3)

In [None]:
# Lnc2Cancer circRNA rows labelled exactly 'up-regulated'; the label column is dropped afterwards.
circRNA_disease2_up = (
    circRNA_disease2
    .loc[circRNA_disease2['regulated'].eq('up-regulated')]  # circBase -- Mondo+HPO
    .drop(columns=['regulated'])
)
circRNA_disease2_up.head(n=3)

In [None]:
! wget http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/lncRNA.xlsx -O ../resources/processed_data/unprocessed_data/lncRNA.xlsx 

In [None]:
# Load the Lnc2Cancer lncRNA table without the columns that are not used here.
lncRNA_disease2 = pd.read_excel(unprocessed_data_location+'lncRNA.xlsx').drop(columns=[
                                 'sample','function description','year','title']) # Mondo+HPO
#lncRNA_disease2.drop(columns=['desc','cancer type'],inplace=True)
lncRNA_disease2['name'] = lncRNA_disease2['name'].str.strip().str.upper()
lncRNA_disease2['cancer type'] = lncRNA_disease2['cancer type'].str.strip().str.lower()
lncRNA_disease2['regulated'] = lncRNA_disease2['regulated'].str.strip().str.lower()
# Inner join on the normalised cancer name: unmapped names are dropped.
lncRNA_disease2 = pd.merge(desc_disPhe_map.rename(columns={0:'cancer type'}), lncRNA_disease2, on=['cancer type']).drop(
    columns=['cancer type']).rename(columns={1:'Disease'})
# Resolve the upper-cased lncRNA name through HGNC symbols (inner join).
lncRNA_disease2 = pd.merge(lncRNA_disease2,
                           rnacentral_map_human_hgnc[['HGNC symbol', 'RNAcentral ID']].drop_duplicates().rename(
                               columns={'HGNC symbol':'name'}), on = 'name').drop(
                                   columns=['name']).rename(columns={'RNAcentral ID':'RNA'})

# Harmonise the regulation vocabulary; the label sets are printed before and after.
print(lncRNA_disease2.regulated.unique())
lncRNA_disease2.regulated = lncRNA_disease2.regulated.str.replace("down-regulation", "down-regulated")
lncRNA_disease2.regulated = lncRNA_disease2.regulated.str.replace("expressed", "expression")
lncRNA_disease2.regulated = lncRNA_disease2.regulated.str.replace("ly", "")
print(lncRNA_disease2.regulated.unique())

# Normalise PMIDs to integer-like strings and keep missing/unparseable values as NaN.
# The previous str.replace(".0", "") was a regex under pandas < 2 (it also removed e.g. "30"
# inside an ID), and astype(str) turned NaN into the string "nan", never "<NA>".
pmid_numeric = pd.to_numeric(lncRNA_disease2['pubmed id'], errors='coerce')
lncRNA_disease2['pubmed id'] = pmid_numeric.map(lambda value: str(int(value)), na_action='ignore')

# "?" and " etc." are literal tokens, so regex is disabled explicitly.
lncRNA_disease2['methods'] = lncRNA_disease2['methods'].str.replace("?", ' ', regex=False)
lncRNA_disease2['methods'] = lncRNA_disease2['methods'].str.replace(" etc.", '', regex=False)
lncRNA_disease2['methods'] = lncRNA_disease2['methods'].str.lower().str.split(", ")
lncRNA_disease2 = lncRNA_disease2.explode('methods')
# Method names are normalised with method_map, as in the other method-normalising cells
# of this notebook; this cell previously joined against location_map by mistake.
# Names without a match keep the raw lower-cased text.
lncRNA_disease2 = pd.merge(lncRNA_disease2, method_map, right_on='0_y', left_on='methods', how='left')
lncRNA_disease2['0_x'] = lncRNA_disease2['0_x'].fillna(lncRNA_disease2['methods'])
lncRNA_disease2 = lncRNA_disease2.drop(columns=['0_y', 'methods'])
lncRNA_disease2 = lncRNA_disease2.rename(columns={'0_x':'Method'})

# Every row is emitted twice: once with Source 'Lnc2Cancer' and once with Source 'NONCODE'.
lncRNA_disease2['Source'] = 'Lnc2Cancer'
lncRNA_disease2n = lncRNA_disease2.copy()
lncRNA_disease2n['Source'] = 'NONCODE'
lncRNA_disease2 = pd.concat([lncRNA_disease2, lncRNA_disease2n])
lncRNA_disease2 = lncRNA_disease2.rename(columns={'RNA':':START_ID', 'Disease':':END_ID', 'pubmed id':'PubMedID'})
lncRNA_disease2.head(n=3)

In [None]:
# Lnc2Cancer lncRNA rows labelled exactly 'up-regulated'; the label column is dropped afterwards.
lncRNA_disease2_up = (
    lncRNA_disease2
    .loc[lncRNA_disease2['regulated'].eq('up-regulated')]  # Mondo+HPO
    .drop(columns=['regulated'])
)
lncRNA_disease2_up.head(n=3)

* LncRNAWiki

In [None]:
# Load LncRNAWiki and keep rows whose biological_context contains 'isease'.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
lncRNA_disease = LncRNAWiki[LncRNAWiki['biological_context'].notna()]
lncRNA_disease = lncRNA_disease[lncRNA_disease.biological_context.str.contains('isease')]
lncRNA_disease = lncRNA_disease.drop(columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','clinical_detail',
                                        'modification_detail','target_interaction', 'conservation_species','target_effect','epigenetic_modification',
                                        'description','conservation','target_type','biological_context','regulator_effect','biological_process','pathway',
                                        'regulator_interaction', 'genome_variation', 'variation_detail', 'molecular_function',
                                        'expression','regulator_type','functional_mechanism'])

# One row per comma-separated transcript id; rows without a transcript are dropped.
lncRNA_disease['transcript_id'] = lncRNA_disease['transcript_id'].str.split(',')
lncRNA_disease = lncRNA_disease.explode('transcript_id')
lncRNA_disease = lncRNA_disease[lncRNA_disease['transcript_id'].notna()]

# One row per comma-separated context detail; rows without one are dropped.
lncRNA_disease['context_detail'] = lncRNA_disease['context_detail'].str.split(',')
lncRNA_disease = lncRNA_disease.explode('context_detail')
lncRNA_disease = lncRNA_disease[lncRNA_disease['context_detail'].notna()]

# Inner joins: context detail -> disease identifier, LncBook transcript id -> RNAcentral ID.
lncRNA_disease = pd.merge(desc_disPhe_map.rename(columns={0:'context_detail'}), lncRNA_disease, on=['context_detail']).drop(
    columns=['context_detail']).rename(columns={1:'Disease'})
lncRNA_disease = pd.merge(lncRNA_disease, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})

# Missing values become the literal string 'nan' in every column from here on.
lncRNA_disease = lncRNA_disease.fillna('nan')
lncRNA_disease.expression_detail = lncRNA_disease.expression_detail.str.split(';')
lncRNA_disease = lncRNA_disease.explode('expression_detail')
print(lncRNA_disease.expression_detail.unique())

# Guards only: after the fillna above neither column can contain missing values.
lncRNA_disease = lncRNA_disease[lncRNA_disease['RNA'].notna()]
lncRNA_disease = lncRNA_disease[lncRNA_disease['Disease'].notna()]

# Normalise PMIDs to integer-like strings and keep missing/unparseable values as NaN.
# The previous str.replace(".0", "") was a regex under pandas < 2 (it also removed e.g. "30"
# inside an ID), and astype(str) turned NaN into the string "nan", never "<NA>".
pmid_numeric = pd.to_numeric(lncRNA_disease['pmid'], errors='coerce')
lncRNA_disease['pmid'] = pmid_numeric.map(lambda value: str(int(value)), na_action='ignore')

# One row per ';'-separated drug, regulator and target (all lower-cased).
lncRNA_disease['drug'] = lncRNA_disease['drug'].str.lower().str.split(";")
lncRNA_disease = lncRNA_disease.explode('drug')

lncRNA_disease['regulator'] = lncRNA_disease['regulator'].str.lower().str.split(";")
lncRNA_disease = lncRNA_disease.explode('regulator')
lncRNA_disease['target'] = lncRNA_disease['target'].str.lower().str.split(";")
lncRNA_disease = lncRNA_disease.explode('target')

# One row per method; names found in method_map ('0_y') are replaced by its '0_x'
# value, others keep the raw lower-cased text.
lncRNA_disease['experimental_method'] = lncRNA_disease['experimental_method'].str.lower().str.split(";")
lncRNA_disease = lncRNA_disease.explode('experimental_method')
lncRNA_disease = pd.merge(lncRNA_disease, method_map, right_on='0_y', left_on='experimental_method', how='left')
lncRNA_disease['0_x'] = lncRNA_disease['0_x'].fillna(lncRNA_disease['experimental_method'])
lncRNA_disease = lncRNA_disease.drop(columns=['0_y', 'experimental_method'])
lncRNA_disease = lncRNA_disease.rename(columns={'0_x':'Method','pmid':'PubMedID', 'drug':'Drug', 'regulator':'Regulator', 'target':'Interactor'})

# Same normalisation for tissue / cell line, using location_map.
lncRNA_disease['tissue/cell line'] = lncRNA_disease['tissue/cell line'].str.lower().str.split(";")
lncRNA_disease = lncRNA_disease.explode('tissue/cell line')
lncRNA_disease = pd.merge(lncRNA_disease, location_map, right_on='0_y', left_on='tissue/cell line', how='left')
lncRNA_disease['0_x'] = lncRNA_disease['0_x'].fillna(lncRNA_disease['tissue/cell line'])
lncRNA_disease = lncRNA_disease.drop(columns=['0_y', 'tissue/cell line'])
lncRNA_disease = lncRNA_disease.rename(columns={'0_x':'Location'})

# Every row is emitted twice: once with Source 'LncRNAWiki' and once with Source 'NONCODE'.
# The NONCODE copy gets its own name: it used to be stored in `lncRNA_disease2`, which
# overwrote the Lnc2Cancer lncRNA frame that the later "expressed in" cell filters on
# its 'regulated' column.
lncRNA_disease['Source'] = 'LncRNAWiki'
lncRNA_disease_noncode = lncRNA_disease.copy()
lncRNA_disease_noncode['Source'] = 'NONCODE'
lncRNA_disease = pd.concat([lncRNA_disease, lncRNA_disease_noncode])
lncRNA_disease = lncRNA_disease.rename(columns={'RNA':':START_ID', 'Disease':':END_ID'})
lncRNA_disease.head(n=3)

In [None]:
# LncRNAWiki rows whose expression detail is exactly 'Up-regulated'; the detail column is dropped afterwards.
lncRNA_disease_up = (
    lncRNA_disease
    .loc[lncRNA_disease['expression_detail'].eq('Up-regulated')]  # Mondo+HPO
    .drop(columns=['expression_detail'])
)
lncRNA_disease_up.head(n=3)

* [TANRIC](https://www.tanric.org/) <br /> TANRIC characterizes the expression profiles of lncRNAs in large patient cohorts of 20 cancer types.

In [None]:
# https://ibl.mdanderson.org/tanric/_design/basic/download.html --> Select all --> Download button --> place them in a folder called tanric
import zipfile
tanric_dir = unprocessed_data_location + "tanric/"
for archive_name in os.listdir(tanric_dir):
    if not archive_name.endswith(".zip"):
        continue
    # From each archive, extract the member named after the archive itself (.zip -> .tsv).
    with zipfile.ZipFile(os.path.join(tanric_dir, archive_name), "r") as archive:
        archive.extract(archive_name[:-4] + ".tsv", tanric_dir)

In [None]:
# Cut-off applied to the per-column values (>=, for counting) and to the per-gene mean (>).
TANRIC_EXPRESSION_THRESHOLD = 0.3
tanric_dir = unprocessed_data_location + "tanric/"
df = []

for tsv_file in os.listdir(tanric_dir): # Mondo+HPO
    if not tsv_file.endswith(".tsv"):
        continue
    # A dedicated name is used for the per-file table: the loop previously reused
    # `lncRNA_disease2`, overwriting the Lnc2Cancer lncRNA frame needed by later cells.
    tanric_expr = pd.read_csv(tanric_dir + tsv_file, sep="\t")
    # Every column except Gene_ID is treated as one experiment.
    sample_values = tanric_expr.drop(columns=['Gene_ID'])
    # Both statistics are computed from the value columns only. The mean used to be taken
    # after 'Number_of_experiments' had been added to the table, so the count was averaged
    # in with the expression values.
    processed_df = pd.DataFrame({
        'Gene_ID': tanric_expr['Gene_ID'],
        'Mean_value_across_experiments': sample_values.mean(axis=1),
        'Number_of_experiments': sample_values[sample_values >= TANRIC_EXPRESSION_THRESHOLD].count(axis=1),
        'Disease': tsv_file,  # file name for now; mapped to a disease identifier in the next cell
    })
    processed_df = processed_df[processed_df['Mean_value_across_experiments'] > TANRIC_EXPRESSION_THRESHOLD]
    processed_df = processed_df[processed_df['Number_of_experiments'] > 0]
    df.append(processed_df)

lncRNA_disease_up_3 = pd.concat(df)
# Keep the part of Gene_ID before the first '.'.
lncRNA_disease_up_3.Gene_ID = lncRNA_disease_up_3.Gene_ID.str.split('.').str[0]
lncRNA_disease_up_3.head(n=3)

In [None]:
# Strip the file-name decorations so that only the cohort code remains. The tokens are
# literals, so regex is disabled explicitly ("." in "-rnaexpr.tsv" would otherwise be a
# wildcard under pandas < 2, where str.replace defaults to regex=True).
for filename_token in ("-rnaexpr.tsv", "TCGA-", "OTHER-", "_CHINA", "_JAPAN", "_KOREA"):
    lncRNA_disease_up_3['Disease'] = lncRNA_disease_up_3['Disease'].str.replace(filename_token, "", regex=False)

# Inner joins: cohort code -> disease identifier (cancer_mondo_map), Ensembl gene -> RNAcentral ID.
lncRNA_disease_up_3 = pd.merge(lncRNA_disease_up_3, cancer_mondo_map.rename(columns={0:'Disease'}), on='Disease').drop(
    columns=['Disease']).rename(columns={1:'Disease'})
lncRNA_disease_up_3 = pd.merge(lncRNA_disease_up_3, rnacentral_map_human_ensembl[[
    'RNAcentral ID', 'Ensembl Gene ID']].drop_duplicates().rename(columns={'Ensembl Gene ID':'Gene_ID'}), on='Gene_ID').drop(
    columns=['Gene_ID']).rename(columns={'RNAcentral ID':'RNA'})
lncRNA_disease_up_3 = lncRNA_disease_up_3.rename(columns={'RNA':':START_ID','Disease':':END_ID','Mean_value_across_experiments':'TANRIC_score'})
lncRNA_disease_up_3['Source'] = 'TANRIC'
lncRNA_disease_up_3.head(n=3)

In [None]:
# Stack every "over-expressed" table built above, then collapse to one row per
# (:START_ID, :END_ID) pair: categorical attributes are collected into sets,
# numeric attributes are averaged.
over_expressed_tables = [
    df_up, pirna_disease_list, lncRNA_pw_up, HCfinal, dbdemc_up,
    mirnet_up, mir2disease_up, hmdd_up, mircancer_up,
    RNA_pDeath_up, circRNA_disease2_up, lncRNA_disease2_up,
    lncRNA_disease_up, lncRNA_disease_up_3,
]
over_expressed_aggregators = {
    'PubMedID': set,
    'Source': set,
    'Fold_Change': np.mean,
    'Regulator': set,
    'Interactor': set,
    'Drug': set,
    'Method': set,
    'Location': set,
    'FDR': np.mean,
    'TANRIC_score': np.mean,
}
RNA_over_expressed_in_OBO = pd.concat(over_expressed_tables)

RNA_over_expressed_in_OBO = (
    RNA_over_expressed_in_OBO
    .groupby([':START_ID', ':END_ID'])
    .agg(over_expressed_aggregators)
    .reset_index()
)

RNA_over_expressed_in_OBO[':TYPE'] = 'over_expressed_in'
RNA_over_expressed_in_OBO.to_pickle(unprocessed_edge_data_location+'RNA_over_expressed_in_OBO.pkl')
RNA_over_expressed_in_OBO.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002206 (expressed in) - OBO

* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html) <br /> Lnc2Cancer is a manually curated database that provides comprehensive experimentally supported associations between lncRNA or circRNA and human cancer.

In [None]:
! wget http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/circRNA.xlsx -O ../resources/processed_data/unprocessed_data/circRNA.xlsx

In [None]:
# Lnc2Cancer circRNA rows labelled exactly 'differential expression'; the label column is dropped afterwards.
circRNA_disease2_de = (
    circRNA_disease2
    .loc[circRNA_disease2['regulated'].eq('differential expression')]
    .drop(columns=['regulated'])
)
circRNA_disease2_de.head(n=3)

In [None]:
! wget http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/download/lncRNA.xlsx -O ../resources/processed_data/unprocessed_data/lncRNA.xlsx 

In [None]:
# Expects `lncRNA_disease2` to be the Lnc2Cancer lncRNA frame (it must carry a 'regulated' column).
# Keeps rows labelled exactly 'differential expression'; the label column is dropped afterwards.
lncRNA_disease2_de = (
    lncRNA_disease2
    .loc[lncRNA_disease2['regulated'].eq('differential expression')]
    .drop(columns=['regulated'])
)
lncRNA_disease2_de.head(n=3)

* LncRNAWiki

In [None]:
# LncRNAWiki disease rows whose expression detail is exactly 'Differentially expressed'.
lncRNA_disease_de = (
    lncRNA_disease
    .loc[lncRNA_disease['expression_detail'].eq('Differentially expressed')]
    .drop(columns=['expression_detail'])
)
lncRNA_disease_de.head(n=3)

In [None]:
# Rows of lncRNA_pw whose expression detail is exactly 'Differentially expressed'.
lncRNA_pw_expressed = (
    lncRNA_pw
    .loc[lncRNA_pw['expression_detail'].eq('Differentially expressed')]
    .drop(columns=['expression_detail'])  # PW
)
lncRNA_pw_expressed.head(n=3)

* miRNet

In [None]:
# miRNet rows whose evidence text contains 'normal'; the evidence column is dropped afterwards.
evidence_mentions_normal = mirnet['evidence'].str.contains('normal')
mirnet_normal = mirnet.loc[evidence_mentions_normal].drop(columns=['evidence']) # Mondo+HPO
mirnet_normal.head(n=3)

* [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/)

In [None]:
# miR2Disease associations whose column 2 is exactly 'normal'; the column is dropped afterwards.
mir2disease_normal = mir2disease.loc[mir2disease[2].eq('normal')].drop(columns=[2])
mir2disease_normal.head(n=3)

In [None]:
# Stack every "expressed in" table and collapse to one row per (:START_ID, :END_ID)
# pair, collecting each attribute's values into a set.
expressed_in_tables = [
    lncRNA_pw_expressed, mirnet_normal, mir2disease_normal,
    lncRNA_disease_de, lncRNA_disease2_de, circRNA_disease2_de,
]
expressed_in_aggregators = {
    'PubMedID': set,
    'Source': set,
    'Regulator': set,
    'Interactor': set,
    'Drug': set,
    'Method': set,
    'Location': set,
}
RNA_expressed_in_OBO = pd.concat(expressed_in_tables)
RNA_expressed_in_OBO = (
    RNA_expressed_in_OBO
    .groupby([':START_ID', ':END_ID'])
    .agg(expressed_in_aggregators)
    .reset_index()
)
RNA_expressed_in_OBO[':TYPE'] = 'expressed_in'
RNA_expressed_in_OBO.to_pickle(unprocessed_edge_data_location+'RNA_expressed_in_OBO.pkl')
RNA_expressed_in_OBO.head(n=3)

In [None]:
# Inverse edge set: swap the start/end column labels of the "expressed in" edges and relabel the type.
swap_endpoints = {':START_ID':':END_ID',':END_ID':':START_ID'}
OBO_expresses_RNA = RNA_expressed_in_OBO.rename(columns=swap_endpoints).assign(**{':TYPE': 'expresses'})
OBO_expresses_RNA.to_pickle(unprocessed_edge_data_location+'OBO_expresses_RNA.pkl')
OBO_expresses_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002246 (under-expressed in) - OBO

* [piRBase](http://bigdata.ibp.ac.cn/piRBase/) <br /> piRBase is a database supporting piRNA functional study.

In [None]:
# Read every saved piRBase disease page; the second HTML table of each page is used.
directory = unprocessed_data_location + "piRBase_disease/"
# Build the list first and concatenate once (avoids re-copying the frame per file).
df = pd.concat([pd.read_html(os.path.join(directory, filename))[1]
                for filename in os.listdir(directory)])

df['Disease'] = df['Disease'].str.lower()
df['Subtype'] = df['Subtype'].str.lower()
df['Name'] = df['Name'].str.strip()
df['Source'] = 'piRBase'
df = df[['Name', 'Disease', 'Subtype', 'Expression', 'PubMed', 'Source']] # Mondo+HPO
# Keep human piRNAs only (rows without a name are dropped as well).
df = df[df['Name'].str.startswith('piR-hsa', na=False)].copy()
# The most specific label wins: use the subtype, falling back to the disease name.
df['Subtype'] = df['Subtype'].fillna(df['Disease'])
df = df.drop(columns=['Disease']).rename(columns={'Subtype': 'Disease'})

df = pd.merge(df, rnacentral_map_human_pirbase.rename(columns={'piRBase ID':'Name'}), on='Name').drop(
    columns=['Name']).rename(columns={'RNAcentral ID':'RNA'})

# Exact-match mapping from lower-cased disease labels to Mondo/HPO identifiers.
disease_label_to_term = {
    'heart failure': 'MONDO_0005252',
    'cardiac hypertrophy': 'HP_0001714',
    'multiple myeloma': 'HP_0006775',
    'gastric cancer': 'MONDO_0001056',
    'breast cancer': 'MONDO_0007254',
    'liver cancer': 'MONDO_0002691',
    'myeloma': 'MONDO_0005170',
    'bladder cancer': 'MONDO_0004986',
    'colorectal cancer': 'MONDO_0005575',
    'pancreas cancer': 'MONDO_0005192',
    'kidney cancer': 'MONDO_0002367',
    'cardiovascular diseases': 'MONDO_0004995',
    'alzheimer': 'MONDO_0004975',
    'thyroid cancer': 'MONDO_0002108',
    'lung cancer': 'MONDO_0008903',
    'prostate cancer': 'MONDO_0008315',
    'parkinson': 'MONDO_0005180',
    'glioblastoma': 'MONDO_0018177',
    'ovarian cancer': 'MONDO_0008170',
}
df['Disease'] = df['Disease'].replace(disease_label_to_term)

print(df.Expression.unique())
# Split the free-text expression field into a direction and an optional fold change.
df[['Regulation', 'Fold_Change']] = df['Expression'].str.extract(r'(up-regulated|down-regulated)\s*FC\s*(\d*\.?\d+)?')
df['Fold_Change'] = pd.to_numeric(df['Fold_Change'], errors='coerce')
df = df.drop(columns=['Expression'])

# PubMed IDs as integer strings, NaN when missing or non-numeric.
# The previous `.str.replace(".0", "")` treated ".0" as a regex on pandas < 2
# (removing any "<char>0" pair from the ID) and left missing values as the string 'nan'.
pubmed_numeric = pd.to_numeric(df['PubMed'], errors='coerce')
df['PubMed'] = pubmed_numeric.map(lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

df.head(n=3)

In [None]:
# piRBase associations reported as down-regulated, shaped as edges.
# 'PubMed' is renamed to 'PubMedID' because the aggregation of the pooled
# under-expressed edges collects references from a column with that name;
# without the rename the piRBase references are silently dropped there.
df_down = (
    df[df['Regulation'] == 'down-regulated']
    .rename(columns={'RNA': ':START_ID', 'Disease': ':END_ID', 'PubMed': 'PubMedID'})
)
df_down.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki pathway associations whose expression detail is "Down-regulated".
pw_is_down = lncRNA_pw['expression_detail'] == 'Down-regulated'
lncRNA_pw_down = lncRNA_pw.loc[pw_is_down].drop(columns=['expression_detail']) # PW
lncRNA_pw_down.head(n=3)

In [None]:
# LncRNAWiki disease associations whose expression detail is "Down-regulated".
lncRNA_disease_down = (
    lncRNA_disease
    .loc[lncRNA_disease['expression_detail'] == 'Down-regulated']
    .drop(columns=['expression_detail'])
)
lncRNA_disease_down.head(n=3)

* LncBook

In [None]:
# Preview of the LncBook frame (defined outside this section); it is pooled
# unchanged into the under-expressed edges further down.
LCfinal.head(n=3) # Mondo+HPO

* miRNet

In [None]:
# miRNet rows whose evidence text mentions "down".
evidence_mentions_down = mirnet['evidence'].str.contains('down')
mirnet_down = mirnet.loc[evidence_mentions_down].drop(columns=['evidence']) # Mondo+HPO
mirnet_down.head(n=3)

* [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/)

In [None]:
# miR2Disease rows labelled "down-regulated" (column 2 holds the label).
mir2disease_is_down = mir2disease[2] == 'down-regulated'
mir2disease_down = mir2disease.loc[mir2disease_is_down].drop(columns=[2])
mir2disease_down.head(n=3)

* [HMDD](https://www.cuilab.cn/hmdd)

In [None]:
# HMDD evidence codes treated as "under-expressed".
hmdd_down_codes = ['tissue_expression_down', 'genetics_knock down_suppress']
hmdd_down = hmdd.loc[hmdd['code'].isin(hmdd_down_codes)].drop(columns=['code'])
hmdd_down.head(n=3)

* dbDEMC

In [None]:
# dbDEMC rows whose status is "DOWN".
dbdemc_is_down = dbdemc['Status'] == 'DOWN'
dbdemc_down = dbdemc.loc[dbdemc_is_down].drop(columns=['Status'])
dbdemc_down.head(n=3)

* miRCancer

In [None]:
# miRCancer rows whose expression profile mentions "down".
profile_mentions_down = mircancer['Profile'].str.contains('down')
mircancer_down = mircancer.loc[profile_mentions_down].drop(columns=['Profile'])
mircancer_down.head(n=3)

* [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/) <br/> ncRDeathDB includes ncRNA types associated with apoptosis, autophagy, and necrosis.

In [None]:
! wget -O https://www.rna-society.org/ncrdeathdb/data/allNcRNACelldeathData.xlsx -P ../resources/processed_data/unprocessed_data/

In [None]:
# ncRDeathDB rows whose action mode is "down".
action_is_down = RNA_pDeath['Action_Mode'] == 'down'
RNA_pDeath_down = RNA_pDeath.loc[action_is_down].drop(columns='Action_Mode') # GO
RNA_pDeath_down.head(n=3)

* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html) <br /> Lnc2Cancer is a manually curated database that provides comprehensive experimentally supported associations between lncRNA or circRNA and human cancer.

In [None]:
# Lnc2Cancer circRNA rows labelled "down-regulated". # circBase -- Mondo+HPO
circRNA_disease2_down = (
    circRNA_disease2
    .loc[circRNA_disease2['regulated'] == 'down-regulated']
    .drop(columns=['regulated'])
)
circRNA_disease2_down.head(n=3)

In [None]:
# Lnc2Cancer lncRNA rows labelled "down-regulated". # Mondo+HPO
lncRNA_disease2_down = (
    lncRNA_disease2
    .loc[lncRNA_disease2['regulated'] == 'down-regulated']
    .drop(columns=['regulated'])
)
lncRNA_disease2_down.head(n=3)

In [None]:
# Pool every source that reports an RNA as down-regulated in an ontology term.
# NOTE(review): the result is bound to RNA_expressed_in_OBO, the same name used
# for the plain "expressed_in" edges, although it is saved as under_expressed_in.
under_expressed_sources = [df_down, lncRNA_pw_down, LCfinal, mirnet_down, dbdemc_down,
                           mir2disease_down, hmdd_down, mircancer_down,
                           lncRNA_disease_down, lncRNA_disease2_down,
                           circRNA_disease2_down, RNA_pDeath_down]

# Evidence attributes become sets; numeric attributes are averaged per edge.
under_expressed_aggregations = {
    'PubMedID': set, 'Source': set, 'Fold_Change': np.mean,
    'Regulator': set, 'Interactor': set, 'Drug': set,
    'Method': set, 'Location': set, 'FDR': np.mean,
}
RNA_expressed_in_OBO = (
    pd.concat(under_expressed_sources)
    .groupby([':START_ID', ':END_ID'])
    .agg(under_expressed_aggregations)
    .reset_index()
)

RNA_expressed_in_OBO[':TYPE'] = 'under_expressed_in'
RNA_expressed_in_OBO.to_pickle(unprocessed_edge_data_location+'RNA_under_expressed_in_OBO.pkl')
RNA_expressed_in_OBO.head(n=3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition) - OBO

* CTD

In [None]:
# Fetch the CTD curated gene-disease report into the unprocessed-data folder.
# The next cell reads it as 'CTD_curated_genes_diseases.tsv' (no .gz suffix), so
# data_downloader presumably decompresses the archive -- TODO confirm.
data_downloader("https://ctdbase.org/reports/CTD_curated_genes_diseases.tsv.gz", unprocessed_data_location)

In [None]:
# CTD curated gene-disease associations ('#' lines in the report are comments).
ctd_gene_disease = pd.read_csv(unprocessed_data_location+'CTD_curated_genes_diseases.tsv', sep='\t', comment="#",
                          names=['GeneSymbol','GeneID','DiseaseName','DiseaseID','OmimIDs','PubMedIDs'])

ctd_gene_disease = ctd_gene_disease[['GeneID','DiseaseID','PubMedIDs']].copy()
# Strip the "MESH:" prefix so that IDs can be joined with the disease map below.
ctd_gene_disease['DiseaseID'] = ctd_gene_disease['DiseaseID'].str.replace('MESH:', '', regex=False)

# PubMed IDs as integer strings, NaN when missing or non-numeric.
# The previous `.str.replace(".0", "")` treated ".0" as a regex on pandas < 2
# (removing any "<char>0" pair from the ID) and left missing values as the string 'nan'.
# NOTE(review): values that are not a single number (e.g. several IDs in one
# field) are coerced to NaN here, exactly as before -- confirm this is intended.
ctd_pubmed_numeric = pd.to_numeric(ctd_gene_disease['PubMedIDs'], errors='coerce')
ctd_gene_disease['PubMedIDs'] = ctd_pubmed_numeric.map(
    lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

ctd_gene_disease = pd.merge(ctd_gene_disease, disgenet_mondo_hpo_map.rename(columns={0:'DiseaseID'}), on='DiseaseID')
# Every association is recorded twice, once per source label ("DisGeNET" and "CTD").
ctd_gene_disease['Source'] = "CTD"
ctd_gene_disease2 = ctd_gene_disease.copy()
ctd_gene_disease['Source'] = "DisGeNET"
ctd_gene_disease = pd.concat([ctd_gene_disease, ctd_gene_disease2])
ctd_gene_disease[['GeneID','PubMedIDs',1,'Source']].head(n=3)

In [None]:
# Shape the CTD frame as edges: gene -> mapped disease term (column 1 of the map).
ctd_edge_columns = {1: ':END_ID', 'GeneID': ':START_ID', 'PubMedIDs': 'PubMedID'}
ctd_gene_disease = (
    ctd_gene_disease[['GeneID', 'PubMedIDs', 1, 'Source']]
    .rename(columns=ctd_edge_columns)
    .groupby([':START_ID', ':END_ID'])
    .agg({'PubMedID': set, "Source": set})
    .reset_index()
)

ctd_gene_disease[':TYPE'] = 'causes_or_contributes_to_condition'
ctd_gene_disease.to_pickle(unprocessed_edge_data_location+'gene_causes_or_contributes_to_condition_OBO.pkl')
ctd_gene_disease.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition) - OBO


* [RNADisease](http://www.rnadisease.org/) <br/> RNADisease collects literature-verified RNA-disease associations and uses a variety of algorithms to generate a large amount of predicted RNA-disease data.

In [None]:
!wget http://www.rnadisease.org/static/download/RNADiseasev4.0_RNA-disease_experiment_all.zip -O ../resources/processed_data/unprocessed_data/RNADiseasev4.0_RNA-disease_experiment_all.zip
import zipfile
with zipfile.ZipFile(unprocessed_data_location+'RNADiseasev4.0_RNA-disease_experiment_all.zip', 'r') as zip_ref: # Mondo+HPO
    zip_ref.extractall(unprocessed_data_location)

In [None]:
RNA_disease = pd.read_excel(unprocessed_data_location+'RNADiseasev4.0_RNA-disease_experiment_all.xlsx')
# Keep human entries only ('specise' is the column name used by the spreadsheet).
RNA_disease = RNA_disease[RNA_disease['specise'].str.contains('apiens')]
# Keep only entries whose score is >= 0.95 (see http://www.rnadisease.org/help Q10)
RNA_disease = RNA_disease[RNA_disease['score']>=0.95]

# Only entries with a Disease Ontology ID can be mapped to Mondo below.
RNA_disease = RNA_disease[(RNA_disease['DO ID'].notna())].copy()
RNA_disease['DO ID'] = RNA_disease['DO ID'].str.replace(':','_', regex=False)
RNA_disease['RNA Type'] = RNA_disease['RNA Type'].str.replace(' ','_', regex=False)
RNA_disease = pd.merge(RNA_disease, doid_mondo_map.rename(columns={0:'DO ID'}), on=['DO ID'])
# Drop everything after the first dot of the symbol.
RNA_disease['RNA Symbol'] = RNA_disease['RNA Symbol'].str.split('.').str[0]

# Column 1 of the map holds comma-separated target terms: one row per term.
RNA_disease[1] = RNA_disease[1].str.split(',')
RNA_disease = RNA_disease.explode(1)
RNA_disease = RNA_disease.drop(columns=['RDID','specise','Disease Name', 'MeSH ID','KEGG disease ID','DO ID'])

# PubMed IDs as integer strings, NaN when missing or non-numeric.
# The previous `.str.replace(".0", "")` treated ".0" as a regex on pandas < 2
# (removing any "<char>0" pair from the ID) and left missing values as the string 'nan'.
rnadisease_pmid_numeric = pd.to_numeric(RNA_disease['PMID'], errors='coerce')
RNA_disease['PMID'] = rnadisease_pmid_numeric.map(
    lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

RNA_disease.head(n=3)

In [None]:
# List the RNA type labels used by RNADisease; the cells below branch on
# 'mRNA' and 'pseudo'.
RNA_disease['RNA Type'].unique()

In [None]:
# List the Ensembl transcript biotypes before the pseudogene labels are collapsed below.
ensembl_map['ensembl_transcript_type'].unique()

In [None]:
# Collapse the Ensembl pseudogene biotypes into the single label "pseudo" used by RNADisease.
pseudogene_biotypes = [
    "transcribed_unitary_pseudogene",
    "transcribed_unprocessed_pseudogene",
    "transcribed_processed_pseudogene",
    "unprocessed_pseudogene",
    "processed_pseudogene",
    "IG_V_pseudogene",
    "unitary_pseudogene",
    "TR_J_pseudogene",
    "TR_V_pseudogene",
    "IG_C_pseudogene",
    "IG_J_pseudogene",
    "translated_processed_pseudogene",
    "pseudogene",
    "IG_pseudogene",
]
transcript_type = ensembl_map['ensembl_transcript_type']
# These are substring replacements, so longer labels must be handled before the
# labels they contain. In the original order "processed_pseudogene" and
# "pseudogene" were replaced first, which left "translated_processed_pseudogene"
# as "translated_pseudo" and "IG_pseudogene" as "IG_pseudo".
for biotype in sorted(pseudogene_biotypes, key=len, reverse=True):
    transcript_type = transcript_type.str.replace(biotype, "pseudo", regex=False)
ensembl_map['ensembl_transcript_type'] = transcript_type

In [None]:
# RNADisease entries typed as pseudogenes, mapped to Ensembl transcripts by gene symbol.
pseudo_disease = RNA_disease[RNA_disease['RNA Type'] == 'pseudo']
ensembl_map_pseudo = ensembl_map[['transcript_stable_id','ensembl_transcript_type', "symbol"]]
# Match against transcripts carrying the collapsed "pseudo" biotype. The filter
# previously selected 'protein_coding' (copied from the mRNA cell), so pseudogene
# symbols were joined against protein-coding transcripts.
ensembl_map_pseudo = ensembl_map_pseudo[ensembl_map_pseudo['ensembl_transcript_type'] == 'pseudo'].drop_duplicates()
pseudo_disease = pd.merge(pseudo_disease, ensembl_map_pseudo, left_on='RNA Symbol',right_on='symbol')
pseudo_disease = pseudo_disease.drop(columns=['RNA Symbol','RNA Type','symbol','ensembl_transcript_type'])

pseudo_disease['Source'] = 'RNADisease'
pseudo_disease.head(n=3)

In [None]:
# Protein-coding Ensembl transcripts, used to resolve mRNA gene symbols.
ensembl_map_mrna = ensembl_map.loc[
    ensembl_map['ensembl_transcript_type'] == 'protein_coding',
    ['transcript_stable_id', 'ensembl_transcript_type', "symbol"],
].drop_duplicates()

# RNADisease entries typed as mRNA, one row per matching transcript.
mRNA_disease = (
    RNA_disease[RNA_disease['RNA Type'] == 'mRNA']
    .merge(ensembl_map_mrna, left_on='RNA Symbol', right_on='symbol')
    .drop(columns=['RNA Symbol', 'RNA Type', 'symbol', 'ensembl_transcript_type'])
    .assign(Source='RNADisease')
)
mRNA_disease.head(n=3)

In [None]:
# Everything that is neither mRNA nor pseudogene is resolved through RNAcentral below.
# An explicit copy is taken because the following cells assign into this frame;
# without it pandas warns about writing to a slice of RNA_disease.
RNAdisease = RNA_disease[~RNA_disease['RNA Type'].isin(['mRNA', 'pseudo'])].copy()

In [None]:
# Show which 5-character prefixes occur among miRNA symbols.
print(RNAdisease[RNAdisease['RNA Type'] == 'miRNA']['RNA Symbol'].str[:5].unique())
# Lower-case the two upper-case viral prefixes.
for upper_prefix, lower_prefix in (("EBV-", "ebv-"), ("MCV-", "mcv-")):
    RNAdisease['RNA Symbol'] = RNAdisease['RNA Symbol'].str.replace(upper_prefix, lower_prefix)

In [None]:
# Resolve symbols that are miRBase labels to RNAcentral IDs.
mirbase_ids = rnacentral_map_mirbase[['Label', 'RNAcentral ID']].drop_duplicates()
RNAdisease = RNAdisease.merge(mirbase_ids, left_on=['RNA Symbol'], right_on=['Label'],
                              how="left").drop(columns=["Label"])

# Where no RNAcentral ID was found, keep the original symbol for the next mapping step.
RNAdisease['RNAcentral ID'] = RNAdisease['RNAcentral ID'].fillna(RNAdisease['RNA Symbol'])
RNAdisease = RNAdisease.drop(columns=['RNA Symbol']).rename(columns={'RNAcentral ID':'RNA Symbol'})
RNAdisease.head(n=3)

In [None]:
# Resolve symbols that are piRBase IDs to RNAcentral IDs.
pirbase_ids = rnacentral_map_human_pirbase[['piRBase ID', 'RNAcentral ID']].drop_duplicates()
RNAdisease = RNAdisease.merge(pirbase_ids, left_on=['RNA Symbol'], right_on=['piRBase ID'],
                              how="left").drop(columns=["piRBase ID"])

# Where no RNAcentral ID was found, keep the current symbol for the next mapping step.
RNAdisease['RNAcentral ID'] = RNAdisease['RNAcentral ID'].fillna(RNAdisease['RNA Symbol'])
RNAdisease = RNAdisease.drop(columns=['RNA Symbol']).rename(columns={'RNAcentral ID':'RNA Symbol'})
RNAdisease.head(n=3)

In [None]:
# Resolve the remaining symbols through HGNC; symbol and RNA category must both match.
hgnc_ids = rnacentral_map_human_hgnc[['RNAcentral ID','HGNC symbol','RNA category']].drop_duplicates()
RNAdisease = RNAdisease.merge(hgnc_ids,
                              left_on=['RNA Symbol', 'RNA Type'],
                              right_on=['HGNC symbol', 'RNA category'],
                              how="left").drop(columns=['RNA Type', "HGNC symbol"])

# Unresolved rows keep their symbol; the final identifier becomes the edge start.
RNAdisease['RNAcentral ID'] = RNAdisease['RNAcentral ID'].fillna(RNAdisease['RNA Symbol'])
RNAdisease = RNAdisease.drop(columns=['RNA Symbol']).rename(
    columns={'RNAcentral ID':':START_ID', 1:':END_ID'})
RNAdisease.head(n=3)

In [None]:
# The mRNA and pseudogene frames still name their endpoints 'transcript_stable_id'
# and 1. Align them with the :START_ID/:END_ID columns before pooling; otherwise
# their rows have a missing :START_ID and are all removed by the filter below.
transcript_edge_columns = {'transcript_stable_id': ':START_ID', 1: ':END_ID'}
RNAdisease = pd.concat([RNAdisease,
                        mRNA_disease.rename(columns=transcript_edge_columns),
                        pseudo_disease.rename(columns=transcript_edge_columns)])
# Keep only rows whose start is a resolved identifier (Ensembl transcript,
# RNAcentral URS or circRNA ID); rows with a missing identifier are excluded.
RNAdisease = RNAdisease[(RNAdisease[':START_ID'].str.startswith("ENST", na=False)) |
                         (RNAdisease[':START_ID'].str.startswith("URS", na=False)) |
                           (RNAdisease[':START_ID'].str.startswith("hsa_circ", na=False))]

RNAdisease = RNAdisease.rename(columns={'score':'RNAsister_score','PMID':'PubMedID'})
# Helper columns left over from the mapping steps; ignore those already removed.
RNAdisease = RNAdisease.drop(columns=['RNA category',1,'transcript_stable_id'], errors='ignore')

RNAdisease['Source'] = 'RNADisease'
RNAdisease.head(n=3)

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php)

In [None]:
!wget -O https://rna.sysu.edu.cn/tsRFun/download/tsRinCancer/allCancer_0.txt -P ../resources/processed_data/unprocessed_data/allCancer_0.txt

In [None]:
tsRNA_disease = pd.read_csv(unprocessed_data_location + 'allCancer_0.txt', sep="\t", index_col=0) # Mondo+HPO

# Only the per-cancer log2 fold-change columns are kept, named by cancer code.
is_log2fc_column = tsRNA_disease.columns.str.endswith('_log2FC')
tsRNA_disease = tsRNA_disease.loc[:, is_log2fc_column]
tsRNA_disease.columns = tsRNA_disease.columns.str.replace(r'_log2FC', '')
# A tsRNA is related to a cancer iff |log2FC| >= 1: smaller changes are zeroed out.
tsRNA_disease = tsRNA_disease.mask(tsRNA_disease.abs() < 1, 0)

tsRNA_disease.head(n=3)

In [None]:
# We want a dataframe with 2 columns, tRF and associated cancer;
# this is an example with ACC 
# Worked example for a single cancer code (ACC): the target layout is one row
# per tRF with a non-zero log2FC for that cancer.
acc_log2fc = tsRNA_disease['ACC']
tRF = [name for name, change in acc_log2fc.items() if change != 0]
log2FC = [change for change in acc_log2fc if change != 0]

df_acc = pd.DataFrame({'tRF': tRF, 'dis': 'ACC', 'log2FC': log2FC})
df_acc.head(n=3)

In [None]:
# Empty dataframe to store processed rows
# Long format: one row per (tRF, cancer) pair with its log2 fold change.
# melt walks the table column by column, i.e. cancer by cancer, which is the same
# row order the previous nested iterrows loops produced, without the Python-level
# loop and without overwriting the notebook-level `df`.
trRF_disease = (
    tsRNA_disease
    .rename_axis(index='tRF')
    .melt(var_name='dis', value_name='log2FC', ignore_index=False)
    .reset_index()
)
# Keep only real associations (|log2FC| >= 1). Values below the threshold are
# zero at this point; missing values are excluded too, whereas the earlier
# `!= 0` test let NaN through as if it were an association.
trRF_disease = trRF_disease[trRF_disease['log2FC'].abs() >= 1].reset_index(drop=True)
trRF_disease.head(n=3)

In [None]:
# Translate cancer codes to ontology terms (column 1 of cancer_mondo_map) and
# shape the frame as edges: tRF -> disease term.
trRF_disease = (
    trRF_disease
    .merge(cancer_mondo_map.rename(columns={0: 'dis'}), on='dis')
    .drop(columns=['dis'])
    .rename(columns={1: 'Disease'})
)
trRF_disease['Source'] = 'tsRFun'

trRF_disease = trRF_disease.rename(columns={'tRF': ':START_ID', 'Disease': ':END_ID'})
trRF_disease.head(n=3)

* miRNet

In [None]:
# miRNet rows whose evidence mentions neither "down" nor "over" are treated as
# generic causal associations.
evidence_has_direction = mirnet['evidence'].str.contains('down|over')
mirnet_causes = mirnet[~evidence_has_direction]
mirnet_causes.head(n=3)

* [HMDD](https://www.cuilab.cn/hmdd)

In [None]:
# HMDD rows whose evidence code is not one of the expression-direction codes.
hmdd_direction_codes = ['tissue_expression_up', 'tissue_expression_down',
                        'genetics_knock down_suppress']
hmdd_causes = hmdd[~hmdd['code'].isin(hmdd_direction_codes)]
hmdd_causes.head(n=3)

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
with open(unprocessed_data_location+'mirset_v9.txt', 'r') as handle: # Mondo+HPO
    tam_text = handle.read().rstrip()

# Each line is a tab-separated record of variable length; short rows are padded with ''.
tam_rows = [line.rstrip().split('\t') for line in io.StringIO(tam_text)]
TAM = pd.DataFrame(tam_rows).fillna('')

TAM = TAM.dropna(axis=1, how='all')
# Keep the rows whose first field is "HMDD"; the second field is lower-cased.
miRNA_disease2 = TAM[TAM[0] == "HMDD"].copy()
miRNA_disease2[1] = miRNA_disease2[1].str.lower()
miRNA_disease2 = miRNA_disease2.dropna(axis=1, how='all').drop(columns=[0])
miRNA_disease2.head(n=3)

In [None]:
# Turn the wide TAM rows into (disease, miRNA) pairs and resolve both sides to identifiers.
# All columns after the first remaining one are joined into one comma-separated string ...
miRNA_disease2['merged'] = miRNA_disease2[miRNA_disease2.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)
miRNA_disease2 = miRNA_disease2[[1,'merged']]

# ... which is split again so that every miRNA gets its own row.
miRNA_disease2['merged'] = miRNA_disease2.merged.str.split(',')
miRNA_disease2 = miRNA_disease2.explode('merged')
miRNA_disease2.rename(columns={1: 'disease', 'merged': 'mir_id'}, inplace=True)

# Disease label -> ontology term (column 1 of desc_disPhe_map); unmapped labels are dropped.
miRNA_disease2 = miRNA_disease2.merge(desc_disPhe_map, left_on='disease', right_on=0).drop(
    columns=[0,'disease']).rename(columns={1:'Disease'})

# Names that do not match an RNAcentral description are retried with an arm
# suffix appended ('-5p' and '-3p').
# NOTE(review): the variable named ...3p receives the '-5p' suffix and vice versa;
# both frames are concatenated, so only the names are misleading.
print(all(miRNA_disease2['mir_id'].isin(rnacentral_map_human['DB Description'])))
miRNA_RNA_miRNAnotInRNAcentral3p = miRNA_disease2[~miRNA_disease2['mir_id'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = miRNA_disease2[~miRNA_disease2['mir_id'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p['mir_id'] = miRNA_RNA_miRNAnotInRNAcentral5p['mir_id'].astype(str) + '-3p'
miRNA_RNA_miRNAnotInRNAcentral3p['mir_id'] = miRNA_RNA_miRNAnotInRNAcentral3p['mir_id'].astype(str) + '-5p'
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral3p, miRNA_RNA_miRNAnotInRNAcentral5p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'mir_id'}), on='mir_id').drop(columns=['mir_id']).rename(columns={'RNAcentral ID':'RNA'})

# Direct matches plus suffix-rescued matches; the RNAcentral bookkeeping columns are removed.
miRNA_disease2 = pd.merge(miRNA_disease2, rnacentral_map_human.rename(
    columns={'DB Description':'mir_id'}), on='mir_id').drop(columns=['DB','DB ID','mir_id','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_disease2 = pd.concat([miRNA_disease2, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])

# NOTE(review): the endpoint columns are still called 'RNA' and 'Disease' when this
# cell finishes; they are not renamed to ':START_ID' / ':END_ID' here.
miRNA_disease2['Source'] = 'TAM' 
miRNA_disease2.head(n=3)

* [miRCancer](http://mircancer.ecu.edu/) <br /> miRCancer provides a comprehensive collection of miRNA expression profiles in various human cancers, automatically extracted from the published literature in PubMed.

In [None]:
# miRCancer rows whose profile mentions neither "down" nor "up".
profile_has_direction = mircancer['Profile'].str.contains("down|up")
mircancer_causes = mircancer[~profile_has_direction].drop(columns=['Profile'])
mircancer_causes.head(n=3)

* [Lnc2Cancer](http://bio-bigdata.hrbmu.edu.cn/lnc2cancer/index.html) <br /> Lnc2Cancer is a manually curated database that provides comprehensive experimentally supported associations between lncRNA or circRNA and human cancer.

In [None]:
# Lnc2Cancer circRNA rows whose 'regulated' field is the literal string 'nan'. # Mondo+HPO
circRNA_disease2_causes = (
    circRNA_disease2
    .loc[circRNA_disease2['regulated'] == 'nan']
    .drop(columns=['regulated'])
)
circRNA_disease2_causes.head(n=3)

In [None]:
# Lnc2Cancer lncRNA rows whose 'regulated' field is the literal string 'nan'. # Mondo+HPO
lncRNA_disease2_causes = (
    lncRNA_disease2
    .loc[lncRNA_disease2['regulated'] == 'nan']
    .drop(columns=['regulated'])
)
lncRNA_disease2_causes.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki disease rows whose expression detail is the literal string 'nan'.
expression_detail_is_nan_label = lncRNA_disease['expression_detail'] == 'nan'
lncRNA_disease_causes = lncRNA_disease.loc[expression_detail_is_nan_label].drop(columns=['expression_detail'])
lncRNA_disease_causes.head(n=3)

In [None]:
# The TAM frame names its endpoints 'RNA' and 'Disease'. Align them with the other
# sources; otherwise its rows have missing :START_ID/:END_ID keys and are silently
# dropped by the groupby. The rename is a no-op if the columns are already aligned.
tam_edges = miRNA_disease2.rename(columns={'RNA': ':START_ID', 'Disease': ':END_ID'})

# Pool every source of generic RNA -> condition associations.
RNA_causes_or_contributes_to_condition_OBO = pd.concat([lncRNA_disease2_causes, lncRNA_disease_causes,circRNA_disease2_causes,
                                                        RNAdisease, trRF_disease, mirnet_causes,
                                                        hmdd_causes, tam_edges, mircancer_causes])

# One row per edge: evidence attributes become sets, numeric scores are averaged.
# ('log2FC' was listed twice in the original dict literal; one entry is enough.)
RNA_causes_or_contributes_to_condition_OBO = RNA_causes_or_contributes_to_condition_OBO.groupby([':START_ID',':END_ID']).agg(
    {'PubMedID':set,'Method':set,'log2FC':'mean','Source':set,'Regulator':set,'Interactor':set,'Drug':set,'Location':set,
     'RNAsister_score':'mean'}).reset_index()

RNA_causes_or_contributes_to_condition_OBO[':TYPE'] = 'causes_or_contributes_to_condition'
RNA_causes_or_contributes_to_condition_OBO.to_pickle(unprocessed_edge_data_location+'RNA_causes_or_contributes_to_condition_OBO.pkl')
RNA_causes_or_contributes_to_condition_OBO.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0004013 (is causal germline mutation in) - OBO

* ClinVar

In [None]:
# download data
url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz'
data_downloader(url, unprocessed_data_location, 'variant_summary.txt.gz')

In [None]:
clinvar_data = pd.read_csv(unprocessed_data_location + 'variant_summary.txt', header=0,
                           delimiter='\t', low_memory=False)[['Name', 'GeneID','Assembly','ClinSigSimple',
                                                              'RS# (dbSNP)','PhenotypeIDS','ReviewStatus']]
# Keep variants that have a gene, ClinSigSimple == 1, a strong review status,
# a dbSNP identifier and GRCh38 coordinates (-1 marks a missing GeneID / rs number).
clinvar_data = clinvar_data[clinvar_data['GeneID'] != -1].drop(columns=['GeneID'])
clinvar_data = clinvar_data[clinvar_data['ClinSigSimple'] == 1].drop(columns=['ClinSigSimple'])
accepted_review_statuses = ["criteria provided, multiple submitters, no conflicts",
                            "reviewed by expert panel",
                            "practice guideline"]
clinvar_data = clinvar_data[clinvar_data['ReviewStatus'].isin(accepted_review_statuses)].drop(columns=['ReviewStatus'])
clinvar_data = clinvar_data[clinvar_data['RS# (dbSNP)'] != -1]
clinvar_data = clinvar_data[clinvar_data['Assembly'] == 'GRCh38'].drop(columns=['Assembly']).copy()
clinvar_data['RS# (dbSNP)'] = 'rs' + clinvar_data['RS# (dbSNP)'].astype(str)
# Keep the part of the variant name before the first dot, which is joined to RefSeq IDs below.
clinvar_data['Name'] = clinvar_data['Name'].str.split(".").str[0]

clinvar_data = pd.merge(clinvar_data, rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(
    columns={'RefSeq ID':'Name'}), on='Name', how='left')
clinvar_data = clinvar_data.drop(columns=['Name'])

# PhenotypeIDS mixes '|', ';' and ',' as separators: one row per identifier.
# A raw-string character class replaces the three split/explode passes and the
# "\|" literal, which is an invalid escape sequence in a non-raw Python string.
clinvar_data['PhenotypeIDS'] = clinvar_data['PhenotypeIDS'].str.split(r"[|;,]")
clinvar_data = clinvar_data.explode('PhenotypeIDS')
clinvar_data['PhenotypeIDS'] = clinvar_data['PhenotypeIDS'].str.replace("MONDO:MONDO:", "MONDO_", regex=False)
clinvar_data['PhenotypeIDS'] = clinvar_data['PhenotypeIDS'].str.replace("Human Phenotype Ontology:HP:", "HP_", regex=False)
# Only Mondo and HPO terms are kept; rows without a phenotype are excluded.
clinvar_data = clinvar_data[clinvar_data['PhenotypeIDS'].str.startswith('HP', na=False)
                            | clinvar_data['PhenotypeIDS'].str.startswith('MONDO', na=False)].copy()
clinvar_data['Source'] = 'ClinVar'
clinvar_data = clinvar_data.rename(columns={'PhenotypeIDS':':END_ID','RNAcentral ID':':START_ID', 'RS# (dbSNP)':'Mutation'})
clinvar_data.head(n=3)

* [PolymiRTS](https://compbio.uthsc.edu/miRSNP/home.php) <br /> PolymiRTS is a database of naturally occurring DNA variations in microRNA (miRNA) seed regions and miRNA target sites.

In [None]:
# Download the PolymiRTS gene/disease-trait table; `-O` names the local output file.
!wget https://compbio.uthsc.edu/miRSNP/download/PolymiRTS3.0/Genes_associated_with_human_diseases_traits.txt -O ../resources/processed_data/unprocessed_data/Genes_associated_with_human_diseases_traits.txt

In [None]:
mrna_disease = pd.read_csv(unprocessed_data_location + 'Genes_associated_with_human_diseases_traits.txt',
                           sep='\t').drop(columns=['Study','Link']) # Mondo+HPO
# Keep associations with p < 0.01; unparsable p-values become NaN and are dropped by the filter.
mrna_disease['p-Value'] = pd.to_numeric(mrna_disease['p-Value'], errors='coerce')
mrna_disease = mrna_disease[mrna_disease['p-Value'] < 0.01].copy()
mrna_disease['Disease/Trait'] = mrna_disease['Disease/Trait'].str.lower()
# Trait label -> ontology term (column 1 of desc_disPhe_map); unmapped labels are dropped.
mrna_disease = pd.merge(mrna_disease, desc_disPhe_map.rename(columns={0:'Disease/Trait'}), on='Disease/Trait').drop(
    columns=['Disease/Trait']).rename(columns={1:'Disease'})

# RefSeq ID -> RNAcentral ID.
mrna_disease = pd.merge(rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(
    columns={'RefSeq ID':'RefSeQID'}), mrna_disease, on='RefSeQID').drop(columns=['RefSeQID'])

mrna_disease['Source'] = 'PolymiRTS'

# PubMed IDs as integer strings, NaN when missing or non-numeric.
# The previous `.str.replace(".0", "")` treated ".0" as a regex on pandas < 2
# (removing any "<char>0" pair from the ID) and left missing values as the string 'nan'.
polymirts_pmid_numeric = pd.to_numeric(mrna_disease['PUBMEDID'], errors='coerce')
mrna_disease['PUBMEDID'] = polymirts_pmid_numeric.map(
    lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)
mrna_disease = mrna_disease.rename(columns={'RNAcentral ID':':START_ID', 'Disease':':END_ID', 'p-Value':'p-value',
                                            'SNPs':'Mutation','PUBMEDID':'PubMedID'})
mrna_disease.head(n=3)

* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php) <br /> miRdSNP is a database of disease-associated SNPs and microRNA target sites on 3'UTRs of human genes.

In [None]:
miRdSNP = pd.read_csv(unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv') # Mondo+HPO
# Keep experimentally confirmed target sites only.
miRdSNP = miRdSNP[miRdSNP['experimentally_confirmed'] == 'Yes']
miRdSNP = miRdSNP.drop(columns=['gene_name','distance','experimentally_confirmed'])
miRdSNP['diseases'] = miRdSNP['diseases'].str.lower()

# Disease label -> ontology term (column 1 of desc_disPhe_map); unmapped labels are dropped.
miRdSNP = miRdSNP.merge(desc_disPhe_map, left_on='diseases', right_on=0).drop(
    columns=[0,'diseases']).rename(columns={1:'Disease'})

# miRNA names that do not match an RNAcentral description are retried with an
# arm suffix appended ('-5p' and '-3p').
mir_in_rnacentral = miRdSNP['miR'].isin(rnacentral_map_human['DB Description'])
print(any(mir_in_rnacentral))
mir_unmatched = miRdSNP[~mir_in_rnacentral]
miRNA_RNA_miRNAnotInRNAcentral5p = mir_unmatched.assign(miR=mir_unmatched['miR'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = mir_unmatched.assign(miR=mir_unmatched['miR'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
# Names that already match RNAcentral are kept as well; previously only the
# suffix-rescued rows were mapped and direct matches were discarded. When no name
# matches directly this adds no rows.
miRdSNP = pd.merge(pd.concat([miRdSNP[mir_in_rnacentral], miRNA_RNA_miRNAnotInRNAcentral]),
                   rnacentral_map_human.rename(columns={'DB Description':'miR'}), on='miR').drop(
    columns=['DB','DB ID','miR','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})

# Target transcript: use the RNAcentral label when known, else the RefSeq ID itself.
miRdSNP = pd.merge(miRdSNP, rnacentral_map_human_refseq[['Label','RefSeq ID']].drop_duplicates().dropna(),
                   left_on='refseq_id', right_on='RefSeq ID', how='left')
miRdSNP['Label'] = miRdSNP['Label'].fillna(miRdSNP['refseq_id'])
miRdSNP = miRdSNP.drop(columns=['RefSeq ID', 'refseq_id'])
miRdSNP = miRdSNP.rename(columns={'Label':'Interactor'})

miRdSNP = miRdSNP.rename(columns={'RNA':':START_ID','Disease':':END_ID', 'SNP':'Mutation'})
miRdSNP['Source'] = 'miRdSNP'
miRdSNP.head(n=3)

In [None]:
# Pool the three germline-variant sources and collapse them to one row per edge.
germline_sources = [clinvar_data, mrna_disease, miRdSNP]
germline_aggregations = {'Mutation': set, 'Source': set, 'PubMedID': set,
                         'p-value': np.mean, 'Interactor': set}
RNA_is_causal_germline_mutation_in_OBO = (
    pd.concat(germline_sources)
    .groupby([':START_ID', ':END_ID'])
    .agg(germline_aggregations)
    .reset_index()
)
RNA_is_causal_germline_mutation_in_OBO[':TYPE'] = 'is_causal_germline_mutation_in'
RNA_is_causal_germline_mutation_in_OBO.to_pickle(unprocessed_edge_data_location+'RNA_is_causal_germline_mutation_in_OBO.pkl')

# Inverse edge: same rows with the endpoints swapped and a new relation label.
germline_swapped_endpoints = {':START_ID': ':END_ID', ':END_ID': ':START_ID'}
OBO_has_material_basis_in_germline_mutation_in_RNA = RNA_is_causal_germline_mutation_in_OBO.rename(
    columns=germline_swapped_endpoints)
OBO_has_material_basis_in_germline_mutation_in_RNA[':TYPE'] = 'has_material_basis_in_germline_mutation_in'
OBO_has_material_basis_in_germline_mutation_in_RNA.to_pickle(
    unprocessed_edge_data_location+'OBO_has_material_basis_in_germline_mutation_in_RNA.pkl')
OBO_has_material_basis_in_germline_mutation_in_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0004014 (is causal somatic mutation in) - OBO

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# Download the LncBook 2.0 variation table; `-O` names the local output file.
!wget https://ngdc.cncb.ac.cn/lncbook/files/variation_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/variation_LncBook2.0.csv.gz

In [None]:
# LncBook variants: only the COSMIC annotation is used here, so the ClinVar/dbSNP
# columns are discarded up front.
unused_variation_columns = ['Symbol','ClinVar Allele ID','ClinVar Variation Effect',
                            'ClinVar Disease Name','Variant Name','dbSNP ID']
lncRNA_disease2 = pd.read_csv(unprocessed_data_location+'variation_LncBook2.0.csv.gz').drop(
    columns=unused_variation_columns) # Mondo+HPO
print(lncRNA_disease2['COSMIC Variation Effect'].unique())
# Keep pathogenic variants that name at least one tumour ('-' marks none).
is_pathogenic = lncRNA_disease2['COSMIC Variation Effect'] == 'Pathogenic'
lncRNA_disease2 = lncRNA_disease2[is_pathogenic].drop(columns=['COSMIC Variation Effect'])
lncRNA_disease2 = lncRNA_disease2[lncRNA_disease2['COSMIC Tumor Name'] != '-']
# One row per ';'-separated tumour name.
lncRNA_disease2 = lncRNA_disease2.assign(
    **{'COSMIC Tumor Name': lncRNA_disease2['COSMIC Tumor Name'].str.split(';')}
).explode('COSMIC Tumor Name')
# Remove parenthesised text, then a single leading space, then lower-case.
tumor_name = lncRNA_disease2['COSMIC Tumor Name'].str.replace(r"\(.*?\)", "", regex=True)
tumor_name = tumor_name.map(lambda desc: desc[1:] if desc.startswith(' ') else desc)
lncRNA_disease2['COSMIC Tumor Name'] = tumor_name.str.lower()

# Tumour label -> ontology term (column 1 of desc_disPhe_map), LncBook gene -> RNAcentral ID.
tumor_term_map = desc_disPhe_map.rename(columns={0:'COSMIC Tumor Name'})
lncRNA_disease2 = pd.merge(tumor_term_map, lncRNA_disease2, on=['COSMIC Tumor Name']).drop(
    columns=['COSMIC Tumor Name']).rename(columns={1:'Disease'})
lncbook_gene_ids = rnacentral_map_human_lncbook[['RNAcentral ID','LncBook Gene ID']].drop_duplicates().rename(
    columns={'LncBook Gene ID':'Gene ID'})
lncRNA_disease2 = pd.merge(lncRNA_disease2, lncbook_gene_ids, on='Gene ID').drop(
    columns=['Gene ID']).rename(columns={'RNAcentral ID':'RNA'})

lncRNA_disease2['Source'] = 'LncBook'
lncRNA_disease2 = lncRNA_disease2.rename(
    columns={'RNA':':START_ID','Disease':':END_ID','COSMIC Mutation ID':'Mutation'})
lncRNA_disease2.head(n=3)

* SomamiR

In [None]:
# SomamiR somatic mutations in lncRNAs that alter miRNA target sites.
# NOTE(review): the .tar.gz archive is read directly as a table; the first column
# ends up named 'lncRNA_somatic_v2.0.txt' and is renamed to 'Gene' below --
# confirm this still parses the first row correctly.
miRNA_lncRNA2 = pd.read_csv(unprocessed_data_location+'lncRNA_somatic_v2.0.txt.tar.gz',sep='\t',dtype={'PMID':str})
miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['Unnamed: 18']) # Mondo+HPO
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'lncRNA_somatic_v2.0.txt':'Gene'})
miRNA_lncRNA2['Gene'] = miRNA_lncRNA2['Gene'].str.replace(r'lnc-', '')
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'Gene': 'symbol', 'miRNA':'mir_id'})
# Keep target sites that are also TargetScan sites.
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['TargetScan_Site(0=No;1=Yes)'] == 1]

# LNCipedia 5.0 transcript IDs -> 5.2 IDs -> RNAcentral IDs.
lncpedia_map = pd.read_csv("https://lncipedia.org/downloads/lncipedia_5_0/lncipedia_5_0_vs_5_2.txt", sep='\t')
miRNA_lncRNA2 = miRNA_lncRNA2.merge(lncpedia_map.rename(columns={'LNCipedia 5.0 Transcript ID':'Transcript'}),
                                    on='Transcript').drop(columns=['Transcript']).rename(
                                        columns={'LNCipedia 5.2 Transcript ID':'Transcript'})
miRNA_lncRNA2 = pd.merge(rnacentral_map_human_lncipedia.rename(columns={'LNCipedia transcript ID':'Transcript'}),
                         miRNA_lncRNA2, on='Transcript').drop(columns=['Transcript','LNCipedia Gene ID']).rename(
                             columns={'RNAcentral ID':'RNA'})

miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['symbol','Chromosome','Location','Ref_Allele','Sample_Name',
                                            'Mut_Allele','FuncClass','Alteration','Target_Site',
                                            'Seed','SeedClass', 'TargetScan_Site(0=No;1=Yes)','Mut_ID'])

miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['Cancer_Class'].notna()].copy()
miRNA_lncRNA2['Cancer_Class'] = miRNA_lncRNA2['Cancer_Class'].str.lower()
# NOTE(review): the original cell also ran
#     miRNA_lncRNA2['Cancer_Class'].str.replace('[ns]','[cancer]')
# without assigning the result, so it had no effect and was removed. If replacing
# "[ns]" by "[cancer]" is wanted, assign the result and pass regex=False, then
# check that the labels still match desc_disPhe_map.
# Join the first and the last bracketed field of the class into one label.
miRNA_lncRNA2['Cancer_Class'] = miRNA_lncRNA2['Cancer_Class'].apply(
    lambda x: ' '.join([x.split('[')[1].split(']')[0], x.split('[')[-1].split(']')[0]])).str.replace('_', ' ')

# Cancer label -> ontology term (column 1 of desc_disPhe_map); unmapped labels are dropped.
miRNA_lncRNA2 = miRNA_lncRNA2.merge(desc_disPhe_map, left_on='Cancer_Class', right_on=0).drop(
    columns=[0,'Cancer_Class']).rename(columns={1:'Disease'})

# PubMed IDs as integer strings, NaN when missing or non-numeric.
# The previous `.str.replace(".0", "")` treated ".0" as a regex on pandas < 2
# (removing any "<char>0" pair from the ID) and left missing values as the string 'nan'.
somamir_pmid_numeric = pd.to_numeric(miRNA_lncRNA2['PMID'], errors='coerce')
miRNA_lncRNA2['PMID'] = somamir_pmid_numeric.map(
    lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

# Every association is recorded twice, once per source label ("SomamiR" and "TargetScan").
miRNA_lncRNA2['Source'] = 'SomamiR'
miRNA_lncRNA3 = miRNA_lncRNA2.copy()
miRNA_lncRNA3['Source'] = 'TargetScan'
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2, miRNA_lncRNA3])
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'RNA':':START_ID','Disease':':END_ID','COSMIC_ID':'Mutation','mir_id':'Interactor','PMID':'PubMedID'})
miRNA_lncRNA2.head(n=3)

In [None]:
# Fetch the SomamiR miRNA somatic-mutation archive into the unprocessed-data folder.
# NOTE(review): the next cell reads 'miRNA_somatic_v2.0.txt.tar' (no .gz) - presumably
# data_downloader decompresses the gzip layer; confirm against its implementation.
data_downloader('https://compbio.uthsc.edu/SomamiR/download/miRNA_somatic_v2.0.txt.tar.gz', unprocessed_data_location)

In [None]:
# SomamiR miRNA somatic-mutation table, reduced to the columns used for the disease edges.
somamir = pd.read_csv(unprocessed_data_location +
                      'miRNA_somatic_v2.0.txt.tar',sep='\t').drop(
                          columns=['Reference','Derived','SNP','Whole_Genome','Whole_Exome','Study_ID','Source',
                          'miRNA_Chromosome','Strand','Maturestart','Matureend','Mutation_Distance','Regioin',
                            'miR2GO_Execution_Sequence','Unnamed: 19','Sample_Name']) # Mondo+HPO
# Remove the literal "[NS]" token. regex=False matters: as a regex, "[NS]" is a character class
# that would delete every 'N' and 'S' and never produce the empty strings filtered out below.
somamir['Cancer_Type'] = somamir['Cancer_Type'].str.replace("[NS]", "", regex=False)
# .copy() so the column assignment below acts on an independent frame.
somamir = somamir[somamir.Cancer_Type!=""].copy()
# Build the disease description from the first and the last bracketed token.
# NOTE(review): when only one bracketed token is left, first and last coincide and the token is repeated.
somamir['Cancer_Type'] = somamir['Cancer_Type'].apply(
    lambda x: ' '.join([x.split('[')[1].split(']')[0], x.split('[')[-1].split(']')[0]])).str.replace('_', ' ')

somamir = somamir.merge(desc_disPhe_map, left_on='Cancer_Type', right_on=0).drop(
    columns=[0,'Cancer_Type']).rename(columns={1:'Disease'})

# Diagnostics: are all miRNA names known to RNAcentral, and which name prefixes are not?
somamir_in_rnacentral = somamir['miRNA_Name'].isin(rnacentral_map_human['DB Description'])
print(all(somamir_in_rnacentral))
print(somamir[~somamir_in_rnacentral]['miRNA_Name'].str[:3].unique())

# Names missing from RNAcentral are retried with both arm suffixes. Each variable now carries
# the suffix its name announces, and .assign works on copies instead of writing to slices.
somamir_unmatched = somamir[~somamir_in_rnacentral]
miRNA_RNA_miRNAnotInRNAcentral5p = somamir_unmatched.assign(
    miRNA_Name=somamir_unmatched['miRNA_Name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = somamir_unmatched.assign(
    miRNA_Name=somamir_unmatched['miRNA_Name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA_Name'}), on='miRNA_Name').drop(columns=['miRNA_Name']).rename(columns={'RNAcentral ID':'RNA'})

somamir = pd.merge(somamir, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA_Name'}), on='miRNA_Name').drop(
        columns=['DB','DB ID','miRNA_Name','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})
# The suffix-rescued rows still carry the RNAcentral bookkeeping columns; drop them after the concat.
somamir = pd.concat([somamir, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Mutation_ID','Organism','RNA category'])

somamir['Source'] = 'SomamiR'
somamir = somamir.rename(columns={'RNA':':START_ID','Disease':':END_ID','COSMIC_ID':'Mutation'})
somamir.head(n=3)

In [None]:
# SomamiR circRNA somatic-mutation table; only rows flagged as TargetScan sites are kept.
circRNA_miRNA = pd.read_csv(unprocessed_data_location + 'circRNA_somatic_v2.0.txt.tar.gz', sep="\t")
circRNA_miRNA = circRNA_miRNA[circRNA_miRNA['TargetScan_Site(0=No;1=Yes)'] == 1]
circRNA_miRNA = circRNA_miRNA.drop(columns=['Gene','Mut_ID','Chromosome','Location','Mut_Allele','FuncClass','Alteration','Ref_Allele',
                                            'Target_Site','Seed','SeedClass','TargetScan_Site(0=No;1=Yes)','Sample_Name','Unnamed: 18'])

# Normalise PMIDs to plain digit strings. Only a trailing ".0" is stripped: an unanchored ".0"
# treated as a regex would delete any character followed by a zero inside the identifier.
circRNA_miRNA['PMID'] = (
    pd.to_numeric(circRNA_miRNA['PMID'], errors='coerce')
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .replace({'nan': np.nan, '<NA>': np.nan})
)

circRNA_miRNA['Cancer_Class'] = circRNA_miRNA['Cancer_Class'].str.lower()
# Remove the literal "[ns]" token. As a regex it is a character class that deletes every 'n' and 's'.
circRNA_miRNA['Cancer_Class'] = circRNA_miRNA['Cancer_Class'].str.replace('[ns]', '', regex=False)
circRNA_miRNA['Cancer_Class'] = circRNA_miRNA['Cancer_Class'].str.replace('_',' ')
# Join the contents of all bracketed tokens with spaces; values without brackets pass through.
# The former explode() after this step was a no-op, because every value is a plain string here.
circRNA_miRNA['Cancer_Class'] = circRNA_miRNA['Cancer_Class'].astype(str).apply(
     lambda x: ' '.join(re.findall(r'\[(.*?)\]', x)) if '[' in x and ']' in x else x)
circRNA_miRNA = pd.merge(circRNA_miRNA, desc_disPhe_map.rename(columns={0:'Cancer_Class'}), on='Cancer_Class')

# Every record is emitted twice, once per credited source (SomamiR and TargetScan).
circRNA_miRNA['Source'] = 'SomamiR'
circRNA_miRNA2 = circRNA_miRNA.copy()
circRNA_miRNA2['Source'] = 'TargetScan'
circRNA_miRNA = pd.concat([circRNA_miRNA, circRNA_miRNA2])
RNA_RNA8 = circRNA_miRNA.rename(columns={'Transcript':':START_ID','COSMIC_ID':'Mutation','miRNA':'Interactor',
                                         'PMID':'PubMedID',1:':END_ID'}).drop_duplicates()
RNA_RNA8.head(n=3)

In [None]:
# Collapse all somatic-mutation evidence into one row per (RNA, disease) pair, gathering the
# edge properties into sets. The variable names say "germline", but the edge type and the
# pickle file names written below are the somatic ones.
RNA_is_causal_germline_mutation_in_OBO = (
    pd.concat([somamir, miRNA_lncRNA2, lncRNA_disease2, RNA_RNA8])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Mutation': set, 'Source': set, 'PubMedID': set, 'Interactor': set})
    .reset_index()
    .assign(**{':TYPE': 'is_causal_somatic_mutation_in'})
)
RNA_is_causal_germline_mutation_in_OBO.to_pickle(
    unprocessed_edge_data_location + 'RNA_is_causal_somatic_mutation_in_OBO.pkl')

# Inverse edge: swap the endpoints and relabel the relation.
OBO_has_material_basis_in_germline_mutation_in_RNA = (
    RNA_is_causal_germline_mutation_in_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'has_material_basis_in_somatic_mutation_in'})
)
OBO_has_material_basis_in_germline_mutation_in_RNA.to_pickle(
    unprocessed_edge_data_location + 'OBO_has_material_basis_in_somatic_mutation_in_RNA.pkl')
OBO_has_material_basis_in_germline_mutation_in_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002436?hosted (molecularly interacts with?molecularly hosted by) - Gene

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/) <br /> snoDB is an interactive database of human small nucleolar RNAs (snoRNAs) that includes up-to-date information on snoRNA features, genomic location, conservation, host gene, snoRNA-RNA targets and snoRNA abundance and provides links to other resources.

In [None]:
# Fetch the full snoDB export; later cells read it back as a tab-separated file named 'download_all'.
data_downloader('https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/download_all', unprocessed_data_location)

In [None]:
# Load the snoDB export and keep the identifier, host gene, target and expression columns.
snoDB_fields = ['rna_central_id', 'host_gene_id', 'rrna_targets', 'snrna_targets', 'lncrna_targets',
                'protein_coding_targets', 'snorna_targets', 'mirna_targets', 'trna_targets',
                'ncrna_targets', 'pseudogene_targets', 'other_targets', 'is_expressed']
snoDB = pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")[snoDB_fields]
snoDB = snoDB.loc[snoDB['rna_central_id'].notna()].rename(columns={'rna_central_id': 'RNA'})

# Every column is cast to str (missing values become the string 'nan') and split on ';' ...
snoDB = snoDB.assign(**{name: snoDB[name].astype(str).str.split(';') for name in snoDB.columns})
# ... then exploded one column after the other, yielding one row per combination of list items.
snoDB = reduce(lambda frame, name: frame.explode(name), list(snoDB.columns), snoDB)

snoDB['Source'] = 'snoDB'
snoDB.head(n=3)

In [None]:
# snoRNA -> host gene edges: rows with a host gene, mapped through ensembl_entrezGene_map
# (column 0 joins on the snoDB host_gene_id, column 1 becomes the Gene endpoint).
snoRNA_hosts = snoDB.loc[snoDB['host_gene_id'] != 'nan', ['RNA', 'host_gene_id', 'is_expressed', 'Source']]
snoRNA_gene = (
    snoRNA_hosts
    .merge(ensembl_entrezGene_map.rename(columns={0: 'host_gene_id'}), on='host_gene_id')
    .drop(columns=['host_gene_id'])
    .rename(columns={1: 'Gene'})
    [['RNA', 'Gene', 'Source']]
    .groupby(['RNA', 'Gene'])
    .agg({'Source': set})
    .reset_index()
)

# Forward edge (RNA hosted by gene) ...
snoRNA_gene = snoRNA_gene.assign(**{':TYPE': 'molecularly_hosted_by'})
snoRNA_gene.rename(columns={'RNA': ':START_ID', 'Gene': ':END_ID'}).to_pickle(
    unprocessed_edge_data_location + "RNA_molecularly_hosted_by_gene.pkl")
# ... and its inverse (gene hosts RNA), written with swapped endpoints.
snoRNA_gene = snoRNA_gene.assign(**{':TYPE': 'molecularly_hosts'})
snoRNA_gene.rename(columns={'RNA': ':END_ID', 'Gene': ':START_ID'}).to_pickle(
    unprocessed_edge_data_location + "gene_molecularly_hosts_RNA.pkl")
snoRNA_gene.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with) - RNA

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter RNA-RNA interaction archive to the path the next cell reads it from.
!wget http://www.rnainter.org/raidMedia/download/Download_data_RR.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RR.tar.gz

In [None]:
# Load the RNAInter RNA-RNA table; the first column is renamed to RNAInterID.
RNA_RNA = (
    pd.read_csv(unprocessed_data_location + 'Download_data_RR.tar.gz', sep='\t')
    .rename(columns={'Download_data_RR.txt': 'RNAInterID'})
)

# We select only strong evidence interactions for hsa
score_ok = RNA_RNA['score'] >= 0.2886
human_1 = RNA_RNA['Species1'].str.contains('apiens')
human_2 = RNA_RNA['Species2'].str.contains('apiens')
RNA_RNA = RNA_RNA[score_ok & human_1 & human_2]

# RNAInter labels protein-coding genes "PCG"; they are relabelled as mRNA here.
for category in ('Category1', 'Category2'):
    RNA_RNA[category] = RNA_RNA[category].str.replace("PCG", 'mRNA')

# Strip the database prefixes from the raw identifiers; tRFdb IDs get a 'trfdb?' prefix instead.
prefix_rewrites = (("NCBI:", ''), ("miRBase:", ''), ("circBase:", ''), ("tRFdb:", 'trfdb?'))
for prefix, replacement in prefix_rewrites:
    for raw_id in ('Raw_ID1', 'Raw_ID2'):
        RNA_RNA[raw_id] = RNA_RNA[raw_id].str.replace(prefix, replacement)

# A cell may list several ';'-separated identifiers: one row per identifier.
for raw_id in ('Raw_ID1', 'Raw_ID2'):
    RNA_RNA[raw_id] = RNA_RNA[raw_id].str.split(';')
    RNA_RNA = RNA_RNA.explode(raw_id)

# Keep only the part of each symbol before the first '.'.
for symbol in ('Interactor1.Symbol', 'Interactor2.Symbol'):
    RNA_RNA[symbol] = RNA_RNA[symbol].str.split('.').str[0]

# Edge properties
for evidence in ('strong', 'weak', 'predict'):
    RNA_RNA[evidence] = RNA_RNA[evidence].str.replace('//','|')
    RNA_RNA[evidence] = RNA_RNA[evidence].str.lower()

RNA_RNA.head(n=3)

In [None]:
# Inventory of the interactor categories present on either side of the table.
a, b = set(RNA_RNA['Category1']), set(RNA_RNA['Category2'])
a | b

In [None]:
# Swap miRBase accessions for RNAcentral IDs on both sides of the interaction.
# Identifiers without a miRBase match keep their original value; the rewritten
# column ends up as the last column of the frame.
mirbase_to_rnacentral = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
for raw_id in ('Raw_ID1', 'Raw_ID2'):
    RNA_RNA = RNA_RNA.merge(
        mirbase_to_rnacentral, left_on=[raw_id], right_on=['miRBase ID'], how="left"
    ).drop(columns=["miRBase ID"])
    RNA_RNA['RNAcentral ID'] = RNA_RNA['RNAcentral ID'].fillna(RNA_RNA[raw_id])
    RNA_RNA = RNA_RNA.drop(columns=[raw_id]).rename(columns={'RNAcentral ID': raw_id})

RNA_RNA.head(n=2)

In [None]:
# Inspect the rows whose second interactor is a piRNA.
is_pirna_target = RNA_RNA['Category2'] == 'piRNA'
print(RNA_RNA.loc[is_pirna_target])

In [None]:
# Show the piRNA rows, then overwrite two of their Raw_ID2 values by hand.
# NOTE(review): 31571 and 39194 are hard-coded index labels, so they only point at the
# intended rows for one specific version and row order of the input. If a label is absent,
# .loc assignment appends a new row instead of failing - confirm against the printed rows.
print(RNA_RNA[RNA_RNA['Category2'] == 'piRNA'])
RNA_RNA.loc[31571, 'Raw_ID2'] = 'piR-hsa-39980'
RNA_RNA.loc[39194, 'Raw_ID2'] = 'piR-hsa-20280'

In [None]:
# Inspect the rows whose first interactor is a tRNA.
is_trna_source = RNA_RNA['Category1'] == 'tRNA'
print(RNA_RNA.loc[is_trna_source])

In [None]:
# Show the tRNA rows, then overwrite two of their Raw_ID1 values with RNAcentral IDs by hand.
# NOTE(review): 23191 and 23192 are hard-coded index labels, so they only point at the
# intended rows for one specific version and row order of the input. If a label is absent,
# .loc assignment appends a new row instead of failing - confirm against the printed rows.
print(RNA_RNA[RNA_RNA['Category1'] == 'tRNA'])
RNA_RNA.loc[23191, 'Raw_ID1'] = 'URS0000287398'
RNA_RNA.loc[23192, 'Raw_ID1'] = 'URS00003C9A26'

In [None]:
# Collapse the Ensembl pseudogene biotypes into the single label "pseudo".
# str.replace substitutes substrings, so order matters: a more specific name must come before
# any shorter name it contains. Previously "translated_processed_pseudogene" and "IG_pseudogene"
# were listed after "processed_pseudogene" / "pseudogene", so their replacements never fired
# and the values ended up as "translated_pseudo" and "IG_pseudo".
pseudogene_biotypes = [
    "transcribed_unitary_pseudogene",
    "transcribed_unprocessed_pseudogene",
    "transcribed_processed_pseudogene",
    "translated_processed_pseudogene",
    "unprocessed_pseudogene",
    "processed_pseudogene",
    "IG_V_pseudogene",
    "unitary_pseudogene",
    "TR_J_pseudogene",
    "TR_V_pseudogene",
    "IG_C_pseudogene",
    "IG_J_pseudogene",
    "IG_pseudogene",
    "pseudogene",
]
transcript_type = ensembl_map['ensembl_transcript_type']
for biotype in pseudogene_biotypes:
    # The names are plain text, so match them literally.
    transcript_type = transcript_type.str.replace(biotype, "pseudo", regex=False)
ensembl_map['ensembl_transcript_type'] = transcript_type

In [None]:
# Replace the raw identifier of pseudogene interactors with column 1 of
# ensembl_entrezTranscript_map (joined on column 0), first for side 1, then for side 2.
# Rows of that category without a match in the map are discarded.
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_pseudo = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'pseudo']

for side in ('1', '2'):
    raw_id, category = 'Raw_ID' + side, 'Category' + side
    # A boolean mask replaces the old "collect labels, then drop(index=labels)" approach.
    # After the first concat the index held duplicate labels, so the second label-based drop
    # also removed unrelated rows that happened to share a label.
    is_pseudo = RNA_RNA[category] == 'pseudo'
    pseudo = pd.merge(RNA_RNA[is_pseudo], ensembl_entrezTranscript_map_pseudo,
                      left_on=[raw_id], right_on=[0]
                      ).drop(columns=[raw_id, 0, 2, 3, 4, 5]).rename(columns={1: raw_id})
    # ignore_index keeps the labels unique for whatever runs next.
    RNA_RNA = pd.concat([pseudo, RNA_RNA[~is_pseudo]], ignore_index=True)

pseudo.head(n=3)

In [None]:
# Replace the raw identifier of mRNA interactors with column 1 of
# ensembl_entrezTranscript_map (joined on column 0, protein-coding rows only),
# first for side 1, then for side 2. Rows of that category without a match are discarded.
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']

for side in ('1', '2'):
    raw_id, category = 'Raw_ID' + side, 'Category' + side
    # A boolean mask replaces the old "collect labels, then drop(index=labels)" approach.
    # After the first concat the index held duplicate labels, so the second label-based drop
    # also removed unrelated rows that happened to share a label.
    is_mrna = RNA_RNA[category] == 'mRNA'
    mrna = pd.merge(RNA_RNA[is_mrna], ensembl_entrezTranscript_map_mrna,
                    left_on=[raw_id], right_on=[0]
                    ).drop(columns=[raw_id, 0, 2, 3, 4, 5]).rename(columns={1: raw_id})
    # ignore_index keeps the labels unique for whatever runs next.
    RNA_RNA = pd.concat([mrna, RNA_RNA[~is_mrna]], ignore_index=True)

mrna.head(n=3)

In [None]:
# Replace the raw identifier of protein / TF interactors with column 1 of entrez_pro_map
# (joined on column 0), first for side 1, then for side 2.
# Rows of those categories without a match in the map are discarded.
entrez_pro_map[0] = entrez_pro_map[0].astype(str)

for side in ('1', '2'):
    raw_id, category = 'Raw_ID' + side, 'Category' + side
    # A boolean mask replaces the old "collect labels, then drop(index=labels)" approach.
    # After the first concat the index held duplicate labels, so the second label-based drop
    # also removed unrelated rows that happened to share a label.
    is_protein = RNA_RNA[category].isin(['protein', 'TF'])
    protein = pd.merge(RNA_RNA[is_protein], entrez_pro_map,
                       left_on=[raw_id], right_on=[0]
                       ).drop(columns=[raw_id, 0]).rename(columns={1: raw_id})
    # ignore_index keeps the labels unique for whatever runs next.
    RNA_RNA = pd.concat([protein, RNA_RNA[~is_protein]], ignore_index=True)

protein.head(n=3)

In [None]:
# Every category not listed here is resolved to an RNAcentral ID through the HGNC symbol
# plus RNA category, first for side 1, then for side 2.
# Rows of those categories without a match in the map are discarded.
categories_mapped_elsewhere = ['protein', 'TF', 'mRNA', 'pseudo', 'piRNA', 'miRNA', 'eRNA', 'circRNA', 'tRF']
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]

for side in ('1', '2'):
    raw_id, category, symbol = 'Raw_ID' + side, 'Category' + side, 'Interactor' + side + '.Symbol'
    # A boolean mask replaces the old "collect labels, then drop(index=labels)" approach.
    # After the first concat the index held duplicate labels, so the second label-based drop
    # also removed unrelated rows that happened to share a label.
    is_other_ncrna = ~RNA_RNA[category].isin(categories_mapped_elsewhere)
    ncrna = pd.merge(RNA_RNA[is_other_ncrna], rnacentral_map_human_hgnc_type,
                     left_on=[symbol, category], right_on=["HGNC symbol", 'RNA category']
                     ).drop(columns=[symbol, raw_id, 'RNA category']).rename(columns={"RNAcentral ID": raw_id})
    # ignore_index keeps the labels unique for whatever runs next.
    RNA_RNA = pd.concat([ncrna, RNA_RNA[~is_other_ncrna]], ignore_index=True)

RNA_RNA = RNA_RNA[['Raw_ID1','Raw_ID2','Category1','Category2','score','strong','weak','predict']]

ncrna.head(n=3)

In [None]:
# Inventory of the interactor categories that remain after the identifier mapping.
a, b = set(RNA_RNA['Category1']), set(RNA_RNA['Category2'])
a | b

In [None]:
# Mislabeled data
# Set aside the rows in which either endpoint has an ID starting with "PR".
id1_has_pr_prefix = RNA_RNA['Raw_ID1'].str.startswith("PR")
id2_has_pr_prefix = RNA_RNA['Raw_ID2'].str.startswith("PR")
RNA_protein_ = RNA_RNA[id1_has_pr_prefix | id2_has_pr_prefix]

In [None]:
# Keep only interactions whose two endpoints carry a recognised RNA identifier prefix.
RNA_RNA = RNA_RNA[(RNA_RNA['Raw_ID1'].str.startswith("URS")) | (RNA_RNA['Raw_ID1'].str.startswith("ENST")) |
                   (RNA_RNA['Raw_ID1'].str.startswith("hsa_circ")) | (RNA_RNA['Raw_ID1'].str.startswith("trfdb?"))]
# .copy() so the 'Method' column below is added to an independent frame, not to a slice.
RNA_RNA = RNA_RNA[(RNA_RNA['Raw_ID2'].str.startswith("URS")) | (RNA_RNA['Raw_ID2'].str.startswith("ENST")) |
                   (RNA_RNA['Raw_ID2'].str.startswith("hsa_circ")) | (RNA_RNA['Raw_ID2'].str.startswith("trfdb?"))].copy()

# Merge the three evidence columns into one '|'-separated string. Plain string concatenation
# gives NaN as soon as one of the three is missing, which threw away the evidence that was
# present. Missing parts are now skipped; only rows with no evidence at all stay NaN.
evidence = RNA_RNA[['strong', 'weak', 'predict']].fillna('')
RNA_RNA['Method'] = (
    (evidence['strong'] + "|" + evidence['weak'] + "|" + evidence['predict'])
    .str.replace(r'\|{2,}', '|', regex=True)   # collapse separators left by empty parts
    .str.strip('|')
    .replace('', np.nan)
)
RNA_RNA = RNA_RNA[['Raw_ID1','Raw_ID2','Method','score']]

RNA_RNA.head(n=3)

In [None]:
# One row per detection method, then translate each method through location_map
# (joined on its '0_y' column; '0_x' holds the replacement value).
RNA_RNA = (
    RNA_RNA
    .assign(Method=RNA_RNA['Method'].str.lower().str.split("|"))
    .explode('Method')
    .merge(location_map, right_on='0_y', left_on='Method', how='left')
)
# Methods without an entry in location_map keep their original text.
RNA_RNA['0_x'] = RNA_RNA['0_x'].fillna(RNA_RNA['Method'])
RNA_RNA = RNA_RNA.drop(columns=['0_y', 'Method']).rename(columns={'0_x': 'Method'})

RNA_RNA = (
    RNA_RNA
    .assign(Source='RNAInter')
    .rename(columns={'Raw_ID1': ':START_ID', 'Raw_ID2': ':END_ID', 'score': 'RNAsister_score'})
    .drop_duplicates()
)
RNA_RNA.head(n=3)

* [PolymiRTS](https://compbio.uthsc.edu/miRSNP/home.php)

In [None]:
# PolymiRTS CLASH table: transcript (RefSeq), miRNA name and mutation ID.
mirna_mrna = pd.read_csv(unprocessed_data_location + 'target_miRSNP_human_CLASH.txt',sep='\t')[['refseq','microRNA_name','mutid']]

# miRNA names containing '*' are expanded into a -5p and a -3p candidate.
# The expansion used to read the names from the frame WITHOUT stars; because the indexes do not
# overlap, the assignment produced all-NaN names and every starred miRNA was dropped below.
# '*' is now replaced literally (as a regex it is an invalid pattern), and na=False lets
# missing names fall through to the notna() filter instead of breaking the mask.
has_star = mirna_mrna['microRNA_name'].str.contains("\\*", na=False)
starred = mirna_mrna[has_star]
mirna_mrna_5p = starred.assign(microRNA_name=starred['microRNA_name'].str.replace('*', '-5p', regex=False))
mirna_mrna_3p = starred.assign(microRNA_name=starred['microRNA_name'].str.replace('*', '-3p', regex=False))
mirna_mrna = pd.concat([mirna_mrna[~has_star], mirna_mrna_5p, mirna_mrna_3p])
mirna_mrna = mirna_mrna[(mirna_mrna['microRNA_name'].notna()) & (mirna_mrna['refseq'].notna())]

# Diagnostics: are all miRNA names known to RNAcentral, and which name prefixes are NOT?
# (The second print previously listed the matched names because the negation was missing.)
polymirts_in_rnacentral = mirna_mrna['microRNA_name'].isin(rnacentral_map_human['DB Description'])
print(all(polymirts_in_rnacentral))
print(mirna_mrna[~polymirts_in_rnacentral]['microRNA_name'].str[:3].unique())

# Names missing from RNAcentral are retried with both arm suffixes; .assign works on copies
# instead of writing to slices, and each variable carries the suffix its name announces.
polymirts_unmatched = mirna_mrna[~polymirts_in_rnacentral]
miRNA_RNA_miRNAnotInRNAcentral5p = polymirts_unmatched.assign(
    microRNA_name=polymirts_unmatched['microRNA_name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = polymirts_unmatched.assign(
    microRNA_name=polymirts_unmatched['microRNA_name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'microRNA_name'}), on='microRNA_name').drop(columns=['microRNA_name']).rename(columns={'RNAcentral ID':'RNA1'})

mirna_mrna = pd.merge(mirna_mrna, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'microRNA_name'}), on='microRNA_name')
mirna_mrna = pd.concat([mirna_mrna.rename(columns={'RNAcentral ID':'RNA1'}),
                       miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['microRNA_name'])

# Resolve the RefSeq transcript to its RNAcentral ID; it becomes the start node of the edge.
mirna_mrna = pd.merge(rnacentral_map_human_refseq[['RNAcentral ID', 'RefSeq ID']].drop_duplicates(), mirna_mrna, left_on='RefSeq ID',
                        right_on='refseq').drop(columns=['RefSeq ID', 'refseq'])

RNA_RNA2 = mirna_mrna.rename(columns={'RNAcentral ID':':START_ID', 'RNA1':':END_ID', 'mutid':'Mutation'})
RNA_RNA2['Source'] = "PolymiRTS"
RNA_RNA2.head(n=3)

* [miRNet](https://www.mirnet.ca/)

In [None]:
!wget -O https://www.dropbox.com/s/oxraur4z5921sg4/miRNet-mir-circRNA.csv?dl=0 -P ../resources/processed_data/unprocessed_data/

In [None]:
# miRNet miRNA-circRNA pairs: resolve the circRNA through circbase_map (column 1 joins on
# genbank_id, column 0 becomes RNA1) and the miRNA through its miRBase accession.
circRNA_miRNA2 = (
    pd.read_csv(unprocessed_data_location + 'miRNet-mir-circRNA.csv?dl=0')
    .drop(columns=['mirnet', 'mir_id', 'symbol', 'entrez', 'mbv', 'embl', 'gene_name'])
    .merge(circbase_map.rename(columns={1: 'genbank_id'}), on='genbank_id')
    .drop(columns=['genbank_id'])
    .rename(columns={0: 'RNA1'})
)

# Diagnostic: is every miRBase accession known to RNAcentral?
print(all(circRNA_miRNA2['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
mirbase_by_accession = rnacentral_map_human_mirbase.rename(columns={'miRBase ID': 'mir_acc'})
circRNA_miRNA2 = (
    circRNA_miRNA2
    .merge(mirbase_by_accession, on='mir_acc')
    .drop(columns=['mir_acc'])
    .rename(columns={'RNAcentral ID': 'RNA2'})
)
circRNA_miRNA2['Source'] = 'miRNet'
# Keep only identifiers that start with "hsa_circ_".
is_hsa_circ = circRNA_miRNA2['RNA1'].str.startswith("hsa_circ_")
circRNA_miRNA2 = circRNA_miRNA2[is_hsa_circ]
RNA_RNA6 = circRNA_miRNA2.rename(columns={'RNA1': ':START_ID', 'RNA2': ':END_ID'})
RNA_RNA6.head(n=3)

* [LncExpDB](https://ngdc.cncb.ac.cn/lncexpdb/) <br /> LncExpDB is a comprehensive database for lncRNA expression. It covers expression profiles of lncRNA genes across various biological contexts, predicts potential functional lncRNAs and their interacting partners, and thus provides essential guidance on experimental design.

In [None]:
# https://ngdc.cncb.ac.cn/lncexpdb/interactions --> Download button
lncRNA_mRNA = (
    pd.read_csv(unprocessed_data_location + 'interaction.txt', sep='\t')
    .drop(columns=['ID', 'lncname', 'pcgname', 'lnclocation', 'pcglocation'])
    # keep only the part of the gene ID before the first '.'
    .assign(pcg=lambda frame: frame['pcg'].str.split('.').str[0])
)
# The "breadth" column indicates how broadly the lncRNA is expressed across biological contexts.
# A threshold of 3 focuses on relationships that are likely to be conserved or important
# across different biological conditions.
lncRNA_mRNA = lncRNA_mRNA[lncRNA_mRNA['breadth']>=3]

# lncRNA side: LncBook gene ID -> RNAcentral ID (RNA1).
lncbook_to_rnacentral = rnacentral_map_human_lncbook[['LncBook Gene ID','RNAcentral ID']].rename(
    columns={'LncBook Gene ID':'geneid'})
lncRNA_mRNA = (
    lncRNA_mRNA
    .merge(lncbook_to_rnacentral, on='geneid')
    .rename(columns={'RNAcentral ID':'RNA1'})
    .drop(columns=['geneid'])
)

# mRNA side: Ensembl gene ID -> its protein-coding transcript IDs (RNA2).
ensembl_map_lncRNA = ensembl_map[['transcript_stable_id','ensembl_gene_id','ensembl_transcript_type']].rename(
    columns={'ensembl_gene_id':'pcg'})
ensembl_map_lncRNA = ensembl_map_lncRNA[ensembl_map_lncRNA['ensembl_transcript_type'] == 'protein_coding']
ensembl_map_lncRNA = ensembl_map_lncRNA.drop(columns=['ensembl_transcript_type']).drop_duplicates()
lncRNA_mRNA = (
    lncRNA_mRNA
    .merge(ensembl_map_lncRNA, on='pcg')
    .rename(columns={'transcript_stable_id':'RNA2'})
    .drop(columns=['pcg'])
)

# One row per biological context.
lncRNA_mRNA['context'] = lncRNA_mRNA['context'].str.split(',')
lncRNA_mRNA = lncRNA_mRNA.explode('context')
# A distance of -1 means the two RNAs are not on the same chromosome ("trans");
# it is stored as missing so it cannot distort a later mean distance.
lncRNA_mRNA['distance'] = lncRNA_mRNA['distance'].replace(-1, np.nan)
lncRNA_mRNA['Source'] = 'LncExpDB'
lncRNA_mRNA = lncRNA_mRNA.rename(columns={'RNA1':':START_ID','RNA2':':END_ID','distance':'Distance'})

# 'Location' is the context with the "normal" labels blanked and the CamelCase labels spelled out.
context_labels = {
    'OrganDevelopment': 'organ development',
    'PreimplantationEmbryo': 'preimplantation embryo',
    'CellDifferentiation': 'cell differentiation',
    'SubcellularLocation': 'subcellular location',
    'Exosome': 'exosome',
    'CancerCellLine': 'cancer cell line',
    'VirusInfection': 'virus infection',
    'Circadian': 'circadian rythm',
}
location = lncRNA_mRNA['context'].replace(['Normal', 'Normal Tissue/Cell Line'], np.nan)
for camel_case, readable in context_labels.items():
    location = location.str.replace(camel_case, readable, regex=False)
lncRNA_mRNA['Location'] = location

RNA_RNA_7 = lncRNA_mRNA.drop(columns=['context','breadth']).drop_duplicates()
RNA_RNA_7.head(n=3)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
!wget -O https://www.dropbox.com/s/gpt1yrwoe1h2gx7/miRNet-mir-sncRNA.csv?dl=0 -P ../resources/processed_data/unprocessed_data/miRNet-mir-sncRNA.csv?dl=0

In [None]:
# miRNet miRNA-sncRNA pairs, restricted to rows whose gene name mentions "small nucleolar".
mirnet_sncrna = pd.read_csv(unprocessed_data_location + 'miRNet-mir-sncRNA.csv?dl=0')
snoRNA_miRNA2 = mirnet_sncrna[mirnet_sncrna.gene_name.str.contains('small nucleolar')].drop(
    columns=['mirnet', 'mir_id', 'entrez', 'symbol', 'gene_name', 'mbv'])

# snoRNA side: Ensembl gene ID -> RNAcentral ID (RNA1).
ensembl_by_gene = rnacentral_map_human_ensembl.rename(columns={'Ensembl Gene ID': 'embl'})
snoRNA_miRNA2 = (
    snoRNA_miRNA2
    .merge(ensembl_by_gene, on='embl')
    .drop(columns=['embl', 'Ensembl transcript ID'])
    .rename(columns={'RNAcentral ID': 'RNA1'})
    .assign(Source='miRNet')
)

# Diagnostic: is every miRBase accession known to RNAcentral?
print(all(snoRNA_miRNA2['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
# miRNA side: miRBase accession -> RNAcentral ID (RNA2), then name the edge endpoints.
snoRNA_miRNA2 = (
    snoRNA_miRNA2
    .merge(rnacentral_map_human_mirbase.rename(columns={'miRBase ID': 'mir_acc'}), on='mir_acc')
    .drop(columns=['mir_acc'])
    .rename(columns={'RNAcentral ID': 'RNA2'})
    .rename(columns={'RNA1': ':START_ID', 'RNA2': ':END_ID'})
)
snoRNA_miRNA2.head(n=3)

In [None]:
# Download the miRNet miRNA-lncRNA table; the saved name keeps the "?dl=0" suffix because the next cell reads that exact name.
!wget https://www.dropbox.com/s/80gvign9866s5na/miRNet-mir-lncRNA.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-mir-lncRNA.csv?dl=0

In [None]:
# miRNet miRNA-lncRNA pairs.
miRNA_lncRNA = pd.read_csv(unprocessed_data_location + "miRNet-mir-lncRNA.csv?dl=0").drop(
    columns=['mirnet', 'symbol', 'mbv', 'entrez', 'gene_name'])

# Diagnostic: is every miRBase accession known to RNAcentral?
print(all(miRNA_lncRNA['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
# miRNA side: miRBase accession -> RNAcentral ID (RNA1).
miRNA_lncRNA = (
    miRNA_lncRNA
    .merge(rnacentral_map_human_mirbase.rename(columns={'miRBase ID': 'mir_acc'}), on='mir_acc')
    .drop(columns=['mir_acc', 'mir_id'])
    .rename(columns={'RNAcentral ID': 'RNA1'})
    .assign(Source='miRNet')
)

# Diagnostic: is every Ensembl gene ID known to RNAcentral?
print(all(miRNA_lncRNA['embl'].isin(rnacentral_map_human_ensembl['Ensembl Gene ID'])))
# lncRNA side, first choice: Ensembl gene ID -> RNAcentral ID (RNA2).
miRNA_lncRNA_rnacentral = (
    miRNA_lncRNA
    .merge(rnacentral_map_human_ensembl.rename(columns={'Ensembl Gene ID': 'embl'}), on='embl')
    .drop(columns=['Ensembl transcript ID'])
    .rename(columns={'RNAcentral ID': 'RNA2'})
)

# Fallback for genes RNAcentral does not cover: use the Ensembl transcript IDs of lncRNA genes.
not_in_rnacentral = ~miRNA_lncRNA['embl'].isin(miRNA_lncRNA_rnacentral['embl'])
miRNA_lncRNA_ensembl = miRNA_lncRNA[not_in_rnacentral].merge(
    ensembl_map, left_on='embl', right_on='ensembl_gene_id')
miRNA_lncRNA_ensembl = miRNA_lncRNA_ensembl.loc[
    miRNA_lncRNA_ensembl['ensembl_gene_type'] == 'lncRNA',
    ['RNA1', 'transcript_stable_id', 'Source']
].rename(columns={'transcript_stable_id': 'RNA2'})
miRNA_lncRNA_rnacentral = miRNA_lncRNA_rnacentral.drop(columns=['embl'])

print(miRNA_lncRNA_ensembl.head(n=3))
print(miRNA_lncRNA_rnacentral.head(n=3))
miRNA_lncRNA = pd.concat([miRNA_lncRNA_rnacentral, miRNA_lncRNA_ensembl]).rename(
    columns={'RNA1': ':START_ID', 'RNA2': ':END_ID'})
miRNA_lncRNA.head(n=3)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# Reload the snoDB export for the RNA-RNA target edges; the RNAcentral ID is the start node.
snoDB_fields = ['rna_central_id', 'host_gene_id', 'rrna_targets', 'snrna_targets', 'lncrna_targets',
                'protein_coding_targets', 'snorna_targets', 'mirna_targets', 'trna_targets',
                'ncrna_targets', 'pseudogene_targets', 'other_targets', 'is_expressed']
snoDB = pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")[snoDB_fields]
snoDB = snoDB.loc[snoDB['rna_central_id'].notna()].rename(columns={'rna_central_id': ':START_ID'})

# Every column is cast to str (missing values become the string 'nan') and split on ';' ...
snoDB = snoDB.assign(**{name: snoDB[name].astype(str).str.split(';') for name in snoDB.columns})
# ... then exploded one column after the other, yielding one row per combination of list items.
snoDB = reduce(lambda frame, name: frame.explode(name), list(snoDB.columns), snoDB)
snoDB = snoDB.fillna('nan')
snoDB['Source'] = 'snoDB'
snoDB.head(n=3)

In [None]:
# snoRNA -> miRNA targets: rows that list a target, keeping the part of the symbol before the first '.'.
has_mirna_target = snoDB['mirna_targets'] != 'nan'
snoRNA_miRNA = snoDB.loc[has_mirna_target, [':START_ID', 'mirna_targets', 'Source']]
snoRNA_miRNA = snoRNA_miRNA.assign(mirna_targets=snoRNA_miRNA['mirna_targets'].str.split('.').str[0])

# Resolve the target symbol to an RNAcentral ID through the HGNC symbol map.
hgnc_by_mirna_symbol = rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates().rename(
    columns={'HGNC symbol': 'mirna_targets'})
snoRNA_miRNA = (
    snoRNA_miRNA
    .merge(hgnc_by_mirna_symbol, on='mirna_targets')
    .drop(columns=['mirna_targets', 'RNA category'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)

snoRNA_miRNA.head(n=3)

In [None]:
# snoRNA -> snoRNA edges: snoDB rows that list a snoRNA target ('nan' marks "no target").
snoRNA_snoRNA = snoDB.loc[snoDB['snorna_targets'] != 'nan', [':START_ID', 'snorna_targets', 'Source']]
# Only the text before the first '.' of each target is used as the join key.
snoRNA_snoRNA = snoRNA_snoRNA.assign(snorna_targets=snoRNA_snoRNA['snorna_targets'].str.split('.').str[0])

# Translate the target (matched against HGNC symbols) into its RNAcentral ID, which becomes :END_ID.
# The 'RNA category' column of the map is carried along in this table.
snoRNA_snoRNA = (
    snoRNA_snoRNA
    .merge(rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates()
           .rename(columns={'HGNC symbol': 'snorna_targets'}), on='snorna_targets')
    .drop(columns=['snorna_targets'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)

snoRNA_snoRNA.head(n=3)

In [None]:
# snoRNA -> lncRNA edges: snoDB rows that list a lncRNA target ('nan' marks "no target").
snoRNA_lncRNA = snoDB.loc[snoDB['lncrna_targets'] != 'nan', [':START_ID', 'lncrna_targets', 'Source']]
# Only the text before the first '.' of each target is used as the join key.
snoRNA_lncRNA = snoRNA_lncRNA.assign(lncrna_targets=snoRNA_lncRNA['lncrna_targets'].str.split('.').str[0])

# Translate the target (matched against HGNC symbols) into its RNAcentral ID, which becomes :END_ID.
# The 'RNA category' column of the map is carried along in this table.
snoRNA_lncRNA = (
    snoRNA_lncRNA
    .merge(rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates()
           .rename(columns={'HGNC symbol': 'lncrna_targets'}), on='lncrna_targets')
    .drop(columns=['lncrna_targets'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)

snoRNA_lncRNA.head(n=3)

In [None]:
# snoRNA -> snRNA edges: snoDB rows that list an snRNA target ('nan' marks "no target").
snoRNA_snRNA = snoDB.loc[snoDB['snrna_targets'] != 'nan', [':START_ID', 'snrna_targets', 'Source']]
# Only the text before the first '.' of each target is used as the join key.
snoRNA_snRNA = snoRNA_snRNA.assign(snrna_targets=snoRNA_snRNA['snrna_targets'].str.split('.').str[0])

# Translate the target (matched against HGNC symbols) into its RNAcentral ID, which becomes :END_ID.
snoRNA_snRNA = (
    snoRNA_snRNA
    .merge(rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates()
           .rename(columns={'HGNC symbol': 'snrna_targets'}), on='snrna_targets')
    .drop(columns=['snrna_targets', 'RNA category'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)

snoRNA_snRNA.head(n=3)

In [None]:
# snoRNA -> rRNA edges: snoDB rows that list an rRNA target ('nan' marks "no target").
snoRNA_rRNA = snoDB.loc[snoDB['rrna_targets'] != 'nan', [':START_ID', 'rrna_targets', 'Source']]
# Only the text before the first '.' of each target is used as the join key.
snoRNA_rRNA = snoRNA_rRNA.assign(rrna_targets=snoRNA_rRNA['rrna_targets'].str.split('.').str[0])

# Targets that match an HGNC symbol are translated into RNAcentral IDs ...
snoRNA_rRNA_rnacentral = (
    snoRNA_rRNA
    .merge(rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates()
           .rename(columns={'HGNC symbol': 'rrna_targets'}), on='rrna_targets')
    .drop(columns=['rrna_targets', 'RNA category'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)
# ... while the remaining ones keep snoDB's own rRNA identifier as :END_ID.
snoRNA_rRNA_snodb = snoRNA_rRNA[
    ~snoRNA_rRNA['rrna_targets'].isin(rnacentral_map_human_hgnc['HGNC symbol'])
].rename(columns={'rrna_targets': ':END_ID'})

print(snoRNA_rRNA_snodb.head(n=3))
print(snoRNA_rRNA_rnacentral.head(n=3))
snoRNA_rRNA = pd.concat([snoRNA_rRNA_rnacentral, snoRNA_rRNA_snodb])
snoRNA_rRNA.head(n=3)

In [None]:
# snoRNA -> tRNA edges: snoDB rows that list a tRNA target ('nan' marks "no target").
snoRNA_tRNA = snoDB.loc[snoDB['trna_targets'] != 'nan', [':START_ID', 'trna_targets', 'Source']]
# Only the text before the first '_' of each target is used as the join key.
snoRNA_tRNA = snoRNA_tRNA.assign(trna_targets=snoRNA_tRNA['trna_targets'].str.split('_').str[0])

# Translate the target (matched against HGNC symbols) into its RNAcentral ID, which becomes :END_ID.
snoRNA_tRNA_rnacentral = (
    snoRNA_tRNA
    .merge(rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates()
           .rename(columns={'HGNC symbol': 'trna_targets'}), on='trna_targets')
    .drop(columns=['trna_targets', 'RNA category'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)
snoRNA_tRNA_rnacentral.head(n=3)

In [None]:
# snoRNA -> generic ncRNA edges: the 'ncrna_targets' and 'other_targets' lists are stacked
# into a single 'ncrna_targets' column before mapping.
snoRNA_other = snoDB[[':START_ID', 'other_targets', 'Source']].rename(columns={'other_targets': 'ncrna_targets'})
snoRNA_ncRNA = pd.concat([snoDB[[':START_ID', 'ncrna_targets', 'Source']], snoRNA_other])
snoRNA_ncRNA = snoRNA_ncRNA.loc[snoRNA_ncRNA['ncrna_targets'] != 'nan']
# Only the text before the first '_' of each target is used as the join key.
snoRNA_ncRNA = snoRNA_ncRNA.assign(ncrna_targets=snoRNA_ncRNA['ncrna_targets'].str.split('_').str[0])

# Translate the target (matched against HGNC symbols) into its RNAcentral ID, which becomes :END_ID.
snoRNA_ncRNA = (
    snoRNA_ncRNA
    .merge(rnacentral_map_human_hgnc.drop(columns=['HGNC ID']).drop_duplicates()
           .rename(columns={'HGNC symbol': 'ncrna_targets'}), on='ncrna_targets')
    .drop(columns=['ncrna_targets', 'RNA category'])
    .rename(columns={'RNAcentral ID': ':END_ID'})
)

snoRNA_ncRNA.head(n=3)

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php) <br /> tsRFun is a platform for exploring tsRNA functions using high-throughput small RNA-Seq and CLIP-Seq data.

In [None]:
!wget https://rna.sysu.edu.cn/tsRFun/download/tsRNetwork/all_hypgm_df.txt -O ../resources/processed_data/unprocessed_data/all_hypgm_df.txt

In [None]:
tsRNA_miRNA = pd.read_csv(unprocessed_data_location + 'all_hypgm_df.txt', sep="\t")
# Keep pairs with FDR < 0.01; the unadjusted p-value is dropped because the adjusted one is kept as 'FDR'.
tsRNA_miRNA = (
    tsRNA_miRNA.loc[tsRNA_miRNA['adj.p'] < 0.01]
    .drop(columns=['p'])
    .rename(columns={'adj.p': 'FDR'})
)

# Diagnostic: are all miRNA names present among the RNAcentral descriptions?
print(tsRNA_miRNA['miRNA'].isin(rnacentral_map_human['DB Description']).all())
# miRNA name -> RNAcentral ID (:START_ID); unmatched names are discarded by the inner join.
tsRNA_miRNA = (
    tsRNA_miRNA
    .merge(rnacentral_map_human.rename(columns={'DB Description': 'miRNA'}), on='miRNA')
    .drop(columns=['DB', 'DB ID', 'miRNA', 'Organism', 'RNA category'])
    .rename(columns={'RNAcentral ID': ':START_ID'})
)
# tsRNA name -> tsRNA identifier (:END_ID), looked up in tsRNA_map on its 'tRNA' column.
tsRNA_miRNA = (
    tsRNA_miRNA
    .merge(tsRNA_map[['tRNA', 'tsRNAid']].rename(columns={'tRNA': 'tsRNA'}), on=['tsRNA'])
    .rename(columns={'tsRNAid': ':END_ID'})
)

tsRNA_miRNA = tsRNA_miRNA.assign(Source='tsRFun').drop_duplicates()
tsRNA_miRNA.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/) <br /> LncRNAWiki is devoted to the community curation of human long non-coding RNAs (lncRNAs), providing a comprehensive and up-to-date resource of functionally annotated lncRNAs. It incorporates a comprehensive collection of experimentally studied lncRNAs, integrates a wealth of annotations based on a standardized curation model, and improves curation quality through expert curator review and community error reports.

In [None]:
!wget https://ngdc.cncb.ac.cn/lncrnawiki/file/LncRNAWiki_BrowseDownload.csv -O ../resources/processed_data/unprocessed_data/LncRNAWiki_BrowseDownload.csv

In [None]:
def _split_and_map(frame, column, mapping, mapped_name):
    """Explode a ';'-separated text column and translate each item through a mapping table.

    The column is lower-cased, split on ';' and exploded to one item per row. `mapping` is
    left-joined on its '0_y' column; where a match exists the value of its '0_x' column is used,
    otherwise the original item is kept. The result is stored as `mapped_name` and the source
    `column` is removed.
    """
    frame = frame.assign(**{column: frame[column].str.lower().str.split(";")}).explode(column)
    frame = pd.merge(frame, mapping, right_on='0_y', left_on=column, how='left')
    frame['0_x'] = frame['0_x'].fillna(frame[column])
    return frame.drop(columns=['0_y', column]).rename(columns={'0_x': mapped_name})


LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
# Keep the records whose target type mentions miRNA (na=False also discards missing types).
# .copy() makes the column writes below act on an independent frame, not on a view of LncRNAWiki.
miRNA_lncRNA2 = LncRNAWiki[LncRNAWiki['target_type'].str.contains('miRNA', na=False)].copy()
miRNA_lncRNA2['target'] = 'hsa-' + miRNA_lncRNA2['target']
miRNA_lncRNA2 = miRNA_lncRNA2.drop(
    columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','biological_process','modification_detail','pathway',
             'conservation_species','target_interaction','description','conservation',
             'biological_context', 'genome_variation','variation_detail','epigenetic_modification',
             'expression','regulator_type','regulator_interaction','target_type','molecular_function',
             'regulator','regulator_effect','target_effect','functional_mechanism','clinical_detail','expression_detail'
             ])
# One row per transcript listed in the comma-separated 'transcript_id' field.
miRNA_lncRNA2['transcript_id'] = miRNA_lncRNA2['transcript_id'].str.split(',')
miRNA_lncRNA2 = miRNA_lncRNA2.explode('transcript_id')
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['transcript_id'].notna()]

# LncBook transcript ID -> RNAcentral ID (RNA2); unmatched transcripts are discarded by the inner join.
miRNA_lncRNA2 = pd.merge(miRNA_lncRNA2, rnacentral_map_human_lncbook.rename(columns={'LncBook Transcript ID':'transcript_id'}),
                          on = 'transcript_id').drop(columns=['transcript_id','LncBook Gene ID']).rename(columns={'RNAcentral ID':'RNA2'})

print(all(miRNA_lncRNA2['target'].isin(rnacentral_map_human['DB Description'])))
# miRNA names unknown to RNAcentral are retried with each arm suffix appended; only the variants
# that exist in the map survive the inner join below.
_unmatched = miRNA_lncRNA2[~miRNA_lncRNA2['target'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = _unmatched.assign(target=_unmatched['target'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = _unmatched.assign(target=_unmatched['target'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral,
                                          rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
                                              columns={'DB Description':'target'}), on='target').drop(
                                                  columns=['target']).rename(columns={'RNAcentral ID':'RNA1'})

miRNA_lncRNA2 = pd.merge(miRNA_lncRNA2, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'target'}), on='target').drop(
        columns=['target']).rename(columns={'RNAcentral ID':'RNA1'})
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2, miRNA_RNA_miRNAnotInRNAcentral]).drop_duplicates()

# Normalise PubMed IDs to plain digit strings. The '.0' left by the float round-trip is removed
# with an anchored, escaped pattern: an unescaped ".0" is a regex on pandas < 2.0 and would also
# delete any character followed by a zero inside the ID. Unparseable/missing IDs become NaN.
miRNA_lncRNA2['pmid'] = (
    pd.to_numeric(miRNA_lncRNA2['pmid'], errors='coerce')
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .replace({'nan': np.nan, '<NA>': np.nan})
)

miRNA_lncRNA2['drug'] = miRNA_lncRNA2['drug'].str.lower().str.split(";")
miRNA_lncRNA2 = miRNA_lncRNA2.explode('drug')

miRNA_lncRNA2 = _split_and_map(miRNA_lncRNA2, 'experimental_method', method_map, 'Method')
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'pmid':'PubMedID', 'drug':'Drug'})

miRNA_lncRNA2 = _split_and_map(miRNA_lncRNA2, 'tissue/cell line', location_map, 'Location')
miRNA_lncRNA2 = _split_and_map(miRNA_lncRNA2, 'context_detail', disease_map, 'Location2')

# Stack the tissue/cell-line rows and the context rows so both end up in a single 'Location' column.
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2.drop(columns=['Location2']), miRNA_lncRNA2.drop(columns=['Location']).rename(columns={'Location2':'Location'})])

miRNA_lncRNA2['Source'] = 'LncRNAWiki'
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'RNA1':':START_ID','RNA2':':END_ID'})
RNA_RNA3 = miRNA_lncRNA2.copy().drop_duplicates()
RNA_RNA3.head(n=3)

* [SomamiR](https://compbio.uthsc.edu/SomamiR/)

In [None]:
!wget https://compbio.uthsc.edu/SomamiR/download/lncRNA_somatic_v2.0.txt.tar.gz -O ../resources/processed_data/unprocessed_data/lncRNA_somatic_v2.0.txt.tar.gz

In [None]:
# NOTE(review): the table is read straight from the .tar.gz; the code expects the first column to
# surface as 'lncRNA_somatic_v2.0.txt' and renames it to 'Gene' — confirm with the pandas version in use.
miRNA_lncRNA2 = pd.read_csv(unprocessed_data_location+'lncRNA_somatic_v2.0.txt.tar.gz',sep='\t',dtype={'PMID':str})
miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['Unnamed: 18']).rename(columns={'lncRNA_somatic_v2.0.txt':'Gene'})
miRNA_lncRNA2['Gene'] = miRNA_lncRNA2['Gene'].str.replace('lnc-', '', regex=False)
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'Gene': 'symbol', 'miRNA':'mir_id'})
# Keep only the rows flagged as TargetScan sites.
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['TargetScan_Site(0=No;1=Yes)'] == 1]

# LNCipedia 5.0 transcript IDs -> 5.2 IDs -> RNAcentral IDs (RNA2); both joins are inner joins.
lncpedia_map = pd.read_csv("https://lncipedia.org/downloads/lncipedia_5_0/lncipedia_5_0_vs_5_2.txt", sep='\t')
miRNA_lncRNA2 = miRNA_lncRNA2.merge(lncpedia_map.rename(columns={'LNCipedia 5.0 Transcript ID':'Transcript'}),
                                    on='Transcript').drop(columns=['Transcript']).rename(
                                        columns={'LNCipedia 5.2 Transcript ID':'Transcript'})
miRNA_lncRNA2 = pd.merge(rnacentral_map_human_lncipedia.rename(columns={'LNCipedia transcript ID':'Transcript'}),
                         miRNA_lncRNA2, on='Transcript').drop(columns=['Transcript','LNCipedia Gene ID']).rename(
                             columns={'RNAcentral ID':'RNA2'})

miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['symbol','Chromosome','Location','Ref_Allele','Sample_Name',
                                            'Mut_Allele','FuncClass','Alteration','Target_Site',
                                            'Seed','SeedClass', 'TargetScan_Site(0=No;1=Yes)'])

print(all(miRNA_lncRNA2['mir_id'].isin(rnacentral_map_human['DB Description'])))
print(miRNA_lncRNA2[~miRNA_lncRNA2['mir_id'].isin(rnacentral_map_human['DB Description'])]['mir_id'].str[:3].unique())
# miRNA names unknown to RNAcentral are retried with each arm suffix appended; only the variants
# that exist in the map survive the inner join below.
_unmatched = miRNA_lncRNA2[~miRNA_lncRNA2['mir_id'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = _unmatched.assign(mir_id=_unmatched['mir_id'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = _unmatched.assign(mir_id=_unmatched['mir_id'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'mir_id'}), on='mir_id').drop(columns=['mir_id']).rename(
        columns={'RNAcentral ID':'RNA1'})

miRNA_lncRNA2 = pd.merge(miRNA_lncRNA2, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'mir_id'}), on='mir_id').drop(
        columns=['mir_id']).rename(columns={'RNAcentral ID':'RNA1'})
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2, miRNA_RNA_miRNAnotInRNAcentral]).drop_duplicates()
miRNA_lncRNA2 = miRNA_lncRNA2[(miRNA_lncRNA2['RNA1'].notna()) & (miRNA_lncRNA2['RNA2'].notna())]

# Normalise PubMed IDs to plain digit strings. The '.0' left by the float round-trip is removed
# with an anchored, escaped pattern: an unescaped ".0" is a regex on pandas < 2.0 and would also
# delete any character followed by a zero inside the ID. Unparseable/missing IDs become NaN.
miRNA_lncRNA2['PMID'] = (
    pd.to_numeric(miRNA_lncRNA2['PMID'], errors='coerce')
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .replace({'nan': np.nan, '<NA>': np.nan})
)

# Clean the cancer class: drop the literal '[ns]' marker, turn '_' into spaces and, when bracketed
# groups are present, join their contents with spaces. regex=False is essential: as a regex,
# '[ns]' is a character class that strips every 'n' and 's' from the text.
miRNA_lncRNA2['Cancer_Class'] = (
    miRNA_lncRNA2['Cancer_Class'].str.lower()
    .str.replace('[ns]', '', regex=False)
    .str.replace('_', ' ', regex=False)
    .astype(str)
    .apply(lambda x: ' '.join(re.findall(r'\[(.*?)\]', x)) if '[' in x and ']' in x else x)
)
# Translate through location_map where possible, otherwise keep the cleaned text.
miRNA_lncRNA2 = pd.merge(miRNA_lncRNA2, location_map, right_on='0_y', left_on='Cancer_Class', how='left')
miRNA_lncRNA2['0_x'] = miRNA_lncRNA2['0_x'].fillna(miRNA_lncRNA2['Cancer_Class'])
miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['0_y', 'Cancer_Class'])
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'0_x':'Location'})
# astype(str) above turned missing classes into the text 'nan'; restore them to real missing values.
miRNA_lncRNA2['Location'] = miRNA_lncRNA2['Location'].replace({'nan': np.nan, '<NA>': np.nan})

# Every edge is recorded twice: once attributed to SomamiR and once to TargetScan.
miRNA_lncRNA2['Source'] = 'SomamiR'
miRNA_lncRNA3 = miRNA_lncRNA2.copy()
miRNA_lncRNA3['Source'] = 'TargetScan'
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2, miRNA_lncRNA3])
RNA_RNA4 = miRNA_lncRNA2.rename(columns={'RNA1':':START_ID','RNA2':':END_ID','COSMIC_ID':'Mutation','PMID':'PubMedID'}).drop_duplicates()
RNA_RNA4.head(n=3)

In [None]:
!wget https://compbio.uthsc.edu/SomamiR/download/circRNA_somatic_v2.0.txt.tar.gz -O ../resources/processed_data/unprocessed_data/circRNA_somatic_v2.0.txt.tar.gz

In [None]:
circRNA_miRNA = pd.read_csv(unprocessed_data_location + 'circRNA_somatic_v2.0.txt.tar.gz', sep="\t")
# Keep only the rows flagged as TargetScan sites.
circRNA_miRNA = circRNA_miRNA[circRNA_miRNA['TargetScan_Site(0=No;1=Yes)'] == 1]
circRNA_miRNA = circRNA_miRNA.drop(columns=['Gene','Mut_ID','Chromosome','Location','Mut_Allele','FuncClass','Alteration','Ref_Allele',
                                            'Target_Site','Seed','SeedClass','TargetScan_Site(0=No;1=Yes)','Sample_Name','Unnamed: 18'])

# Lower-case the transcript names and keep those starting with 'hsa_circ_<digits>'
# (na=False treats missing transcripts as non-matching instead of raising).
circRNA_miRNA['Transcript'] = circRNA_miRNA['Transcript'].str.lower()
circRNA_miRNA = circRNA_miRNA[circRNA_miRNA['Transcript'].str.match(r'hsa_circ_\d+', na=False)]

print(all(circRNA_miRNA['miRNA'].isin(rnacentral_map_human['DB Description'])))
print(circRNA_miRNA[~circRNA_miRNA['miRNA'].isin(rnacentral_map_human['DB Description'])]['miRNA'].str[:3].unique())
# miRNA names unknown to RNAcentral are retried with each arm suffix appended; only the variants
# that exist in the map survive the inner join below.
_unmatched = circRNA_miRNA[~circRNA_miRNA['miRNA'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = _unmatched.assign(miRNA=_unmatched['miRNA'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = _unmatched.assign(miRNA=_unmatched['miRNA'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
# This join uses the full map, so its remaining columns (e.g. 'DB', 'DB ID', 'Organism',
# 'RNA category') are carried along for the suffixed names; they are kept on purpose because the
# final RNA-RNA aggregation drops columns with those names.
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA'}), on='miRNA').drop(columns=['miRNA']).rename(columns={'RNAcentral ID':'miRNA'})
circRNA_miRNA = pd.merge(circRNA_miRNA, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'miRNA'}), on='miRNA').drop(
        columns=['miRNA']).rename(columns={'RNAcentral ID':'miRNA'})
circRNA_miRNA = pd.concat([circRNA_miRNA, miRNA_RNA_miRNAnotInRNAcentral]).rename(columns={'miRNA':'RNA1','Transcript':'RNA2'})

# Normalise PubMed IDs to plain digit strings. The '.0' left by the float round-trip is removed
# with an anchored, escaped pattern: an unescaped ".0" is a regex on pandas < 2.0 and would also
# delete any character followed by a zero inside the ID. Unparseable/missing IDs become NaN.
circRNA_miRNA['PMID'] = (
    pd.to_numeric(circRNA_miRNA['PMID'], errors='coerce')
    .astype(str)
    .str.replace(r'\.0$', '', regex=True)
    .replace({'nan': np.nan, '<NA>': np.nan})
)

# Clean the cancer class: drop the literal '[ns]' marker, turn '_' into spaces and, when bracketed
# groups are present, join their contents with spaces. regex=False is essential: as a regex,
# '[ns]' is a character class that strips every 'n' and 's' from the text.
circRNA_miRNA['Cancer_Class'] = (
    circRNA_miRNA['Cancer_Class'].str.lower()
    .str.replace('[ns]', '', regex=False)
    .str.replace('_', ' ', regex=False)
    .astype(str)
    .apply(lambda x: ' '.join(re.findall(r'\[(.*?)\]', x)) if '[' in x and ']' in x else x)
)
# Translate through location_map where possible, otherwise keep the cleaned text.
circRNA_miRNA = pd.merge(circRNA_miRNA, location_map, right_on='0_y', left_on='Cancer_Class', how='left')
circRNA_miRNA['0_x'] = circRNA_miRNA['0_x'].fillna(circRNA_miRNA['Cancer_Class'])
circRNA_miRNA = circRNA_miRNA.drop(columns=['0_y', 'Cancer_Class'])
circRNA_miRNA = circRNA_miRNA.rename(columns={'0_x':'Location'})
# astype(str) above turned missing classes into the text 'nan'; restore them to real missing values.
circRNA_miRNA['Location'] = circRNA_miRNA['Location'].replace({'nan': np.nan, '<NA>': np.nan})

# Every edge is recorded twice: once attributed to SomamiR and once to TargetScan.
circRNA_miRNA['Source'] = 'SomamiR'
circRNA_miRNA2 = circRNA_miRNA.copy()
circRNA_miRNA2['Source'] = 'TargetScan'
circRNA_miRNA = pd.concat([circRNA_miRNA, circRNA_miRNA2])
circRNA_miRNA = circRNA_miRNA[circRNA_miRNA['RNA2'].str.startswith('hsa_circ_')]
# Here the circRNA (RNA2) is the edge start and the miRNA (RNA1) the edge end.
RNA_RNA8 = circRNA_miRNA.rename(columns={'RNA2':':START_ID','RNA1':':END_ID','COSMIC_ID':'Mutation','PMID':'PubMedID'}).drop_duplicates()
RNA_RNA8.head(n=3)

* [LncBase](https://diana.e-ce.uth.gr/lncbasev3/home) <br />  DIANA-LncBase v3 is a reference repository with experimentally supported miRNA targets on long non-coding transcripts.

In [None]:
! wget -O https://dianalab.e-ce.uth.gr/downloads/lncbase_v2_pred_data.tar.gz -P ../resources/processed_data/unprocessed_data/
import tarfile
with tarfile.open(unprocessed_data_location + 'lncbase_v2_pred_data.tar.gz', 'r:gz') as tar:
    tar.extractall(unprocessed_data_location)

In [None]:
mirna_lncrna2 = pd.read_csv(unprocessed_data_location+"lncBaseV2_predicted_human_data.csv").drop(columns=['Gene_ID(Gene_Name)'])

# From LncBase: high-confidence miTG score is >= 0.5
mirna_lncrna2["miTG-score"] = pd.to_numeric(mirna_lncrna2["miTG-score"], errors='coerce')
# .copy() makes the column writes below act on an independent frame rather than on a filtered view.
mirna_lncrna2 = mirna_lncrna2[mirna_lncrna2["miTG-score"] >= 0.5].copy()
mirna_lncrna2['#Transcript_ID'] = mirna_lncrna2['#Transcript_ID'].str.replace('>', '', regex=False)
# Strip the parenthesised part of the miRNA name column.
mirna_lncrna2['Mirna_Name(miRBase_version)'] = mirna_lncrna2['Mirna_Name(miRBase_version)'].str.replace(r'\(.*\)', '', regex=True)
mirna_lncrna2['Source'] = 'LncBase'

print(all(mirna_lncrna2['Mirna_Name(miRBase_version)'].isin(rnacentral_map_human['DB Description'])))
print(mirna_lncrna2[~mirna_lncrna2['Mirna_Name(miRBase_version)'].isin(rnacentral_map_human['DB Description'])][
    'Mirna_Name(miRBase_version)'].str[:3].unique())
# miRNA names unknown to RNAcentral are retried with each arm suffix appended; only the variants
# that exist in the map survive the inner join below.
_unmatched = mirna_lncrna2[~mirna_lncrna2['Mirna_Name(miRBase_version)'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = _unmatched.assign(
    **{'Mirna_Name(miRBase_version)': _unmatched['Mirna_Name(miRBase_version)'].astype(str) + '-5p'})
miRNA_RNA_miRNAnotInRNAcentral3p = _unmatched.assign(
    **{'Mirna_Name(miRBase_version)': _unmatched['Mirna_Name(miRBase_version)'].astype(str) + '-3p'})
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'Mirna_Name(miRBase_version)'}), on='Mirna_Name(miRBase_version)').drop(columns=[
        'Mirna_Name(miRBase_version)']).rename(columns={'RNAcentral ID':'RNA1'})

mirna_lncrna2 = pd.merge(mirna_lncrna2, rnacentral_map_human[['DB Description','RNAcentral ID']].drop_duplicates().rename(
    columns={'DB Description':'Mirna_Name(miRBase_version)'}),on='Mirna_Name(miRBase_version)').drop(columns=['Mirna_Name(miRBase_version)']
                                                                                                     ).rename(columns={'RNAcentral ID':'RNA1'})
mirna_lncrna2 = pd.concat([mirna_lncrna2, miRNA_RNA_miRNAnotInRNAcentral]).drop_duplicates()

mirna_lncrna2 = mirna_lncrna2[(mirna_lncrna2['RNA1'].notna()) & (mirna_lncrna2["#Transcript_ID"].notna())]
# Diagnostics before the '.'-suffix is removed from the transcript IDs ...
print(all(mirna_lncrna2['#Transcript_ID'].isin(rnacentral_map_human_ensembl['Ensembl transcript ID'])))
print(mirna_lncrna2[~mirna_lncrna2['#Transcript_ID'].isin(rnacentral_map_human_ensembl['Ensembl transcript ID'])][
    '#Transcript_ID'].str[:3].unique())
mirna_lncrna2['#Transcript_ID'] = mirna_lncrna2['#Transcript_ID'].str.split(".").str[0]
# ... and the same coverage check afterwards.
print(all(mirna_lncrna2['#Transcript_ID'].isin(rnacentral_map_human_ensembl['Ensembl transcript ID'])))
# Transcripts known to RNAcentral get their RNAcentral ID as RNA2 ...
mirna_lncrna2_rnacentral = pd.merge(mirna_lncrna2, rnacentral_map_human_ensembl.rename(columns={"Ensembl transcript ID":'#Transcript_ID'}),
                                     on='#Transcript_ID').drop(columns=['Ensembl Gene ID', '#Transcript_ID']).rename(
                                         columns={'RNAcentral ID':'RNA2'})
# ... while the remaining 'ENST' transcripts keep their transcript ID as RNA2.
mirna_lncrna2_ensembl = mirna_lncrna2[~(mirna_lncrna2['#Transcript_ID'].isin(
    rnacentral_map_human_ensembl['Ensembl transcript ID'])) & (mirna_lncrna2['#Transcript_ID'].str.startswith('ENST'))].rename(
        columns={"#Transcript_ID":'RNA2'})
print(mirna_lncrna2_ensembl.head(n=3))
print(mirna_lncrna2_rnacentral.head(n=3))

RNA_RNA5 = pd.concat([mirna_lncrna2_rnacentral,mirna_lncrna2_ensembl]).drop_duplicates()
RNA_RNA5 = RNA_RNA5.rename(columns={'RNA1':':START_ID','RNA2':':END_ID', 'miTG-score':'miTG_score'})
RNA_RNA5.head(n=3)

In [None]:
# Stack every RNA-RNA edge table, drop the bookkeeping columns and collapse to one row per
# (:START_ID, :END_ID) pair: numeric scores are averaged, the other attributes are gathered into sets.
RNA_interacts_with_RNA = (
    pd.concat([
        RNA_RNA5, RNA_RNA8, RNA_RNA4, RNA_RNA3, tsRNA_miRNA, snoRNA_ncRNA, snoRNA_tRNA_rnacentral,
        snoRNA_rRNA, snoRNA_snRNA, snoRNA_lncRNA, snoRNA_snoRNA, snoRNA_miRNA, miRNA_lncRNA, snoRNA_miRNA2,
        RNA_RNA_7, RNA_RNA6, RNA_RNA2, RNA_RNA,
    ])
    .drop(columns=['DB', 'DB ID', 'Organism', 'RNA category', 'Mut_ID', 'tsRNA'])
    .groupby([':START_ID', ':END_ID'])
    .agg({
        'miTG_score': np.mean,
        'Source': set,
        'Mutation': set,
        'PubMedID': set,
        'Location': set,
        'Drug': set,
        'Method': set,
        'FDR': np.mean,
        'Distance': np.mean,
        'RNAsister_score': np.mean,
    })
    .reset_index()
)
# Add the reverse of every edge (start and end swapped), so each pair is present in both directions.
RNA_interacts_with_RNA_inv = RNA_interacts_with_RNA.rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
RNA_interacts_with_RNA = pd.concat([RNA_interacts_with_RNA, RNA_interacts_with_RNA_inv]).assign(
    **{':TYPE': 'interacts_with'})
RNA_interacts_with_RNA.to_pickle(unprocessed_edge_data_location + 'RNA_interacts_with_RNA.pkl')
RNA_interacts_with_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with) - RNA

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br />  LncBook accommodates a high-quality collection of 95,243 human lncRNA genes and 323,950 lncRNA transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts.

In [None]:
!wget https://ngdc.cncb.ac.cn/lncbook/files/lncrna_mirna_miRandaAndTargetScanAndRNAhybrid_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/lncrna_mirna_miRandaAndTargetScanAndRNAhybrid_LncBook2.0.csv.gz

In [None]:
# The LncBook file has no header row; columns are addressed by position.
mirna_lncrna2 = pd.read_csv(unprocessed_data_location+"lncrna_mirna_miRandaAndTargetScanAndRNAhybrid_LncBook2.0.csv.gz", header=None)
mirna_lncrna2 = pd.DataFrame({
    # column 3 is used as the symbol, falling back to column 0 where it is missing
    'symbol': mirna_lncrna2[3].fillna(mirna_lncrna2[0]),
    'Binding_start': mirna_lncrna2[4],
    'Binding_end': mirna_lncrna2[5],
    'Minimum free energy [kcal/mol]': mirna_lncrna2[6],
    'mir_id': mirna_lncrna2[10],
})
# Every row is recorded four times, once per source label.
mirna_lncrna2['Source'] = 'LncBook'
mirna_lncrna3 = mirna_lncrna2.assign(Source='TargetScan')
mirna_lncrna4 = mirna_lncrna2.assign(Source='miRanda')
mirna_lncrna5 = mirna_lncrna2.assign(Source='RNAhybrid')
mirna_lncrna2 = pd.concat([mirna_lncrna2, mirna_lncrna3, mirna_lncrna4, mirna_lncrna5])

# miRNA name -> RNAcentral ID (RNA1); unmatched names are discarded by the inner join.
print(mirna_lncrna2['mir_id'].isin(rnacentral_map_human['DB Description']).all())
mirna_lncrna2 = (
    mirna_lncrna2
    .merge(rnacentral_map_human.rename(columns={'DB Description': 'mir_id'}), on='mir_id')
    .drop(columns=['DB', 'DB ID', 'mir_id', 'Organism', 'RNA category'])
    .rename(columns={'RNAcentral ID': 'RNA1'})
)

# These are few sequences without an official symbol. Therefore, we remove them
_symbol_unmapped = ~mirna_lncrna2['symbol'].isin(rnacentral_map_human_lncbook['LncBook Transcript ID'])
print(mirna_lncrna2[_symbol_unmapped]['symbol'].unique()[:3])
print(mirna_lncrna2[_symbol_unmapped]['symbol'].str[:6].unique())
# LncBook transcript ID -> RNAcentral ID (RNA2); the inner join performs the removal.
mirna_lncrna2 = (
    mirna_lncrna2
    .merge(rnacentral_map_human_lncbook[['RNAcentral ID', 'LncBook Transcript ID']].drop_duplicates()
           .rename(columns={'LncBook Transcript ID': 'symbol'}), on='symbol')
    .drop(columns=['symbol'])
    .rename(columns={'RNAcentral ID': 'RNA2'})
)

# 'start-end' text for the binding site; pairs with both coordinates missing become NaN.
mirna_lncrna2['Binding_pos'] = (
    mirna_lncrna2['Binding_start'].astype(str) + '-' + mirna_lncrna2['Binding_end'].astype(str)
).replace('nan-nan', np.nan)

mirna_lncrna2 = mirna_lncrna2.rename(
    columns={'RNA1': ':START_ID', 'RNA2': ':END_ID', 'Minimum free energy [kcal/mol]': 'Minimum_free_energy_kcal_mol'})
mirna_lncrna2.head(n=3)

In [None]:
# Add the reverse of every LncBook edge, then collapse to one row per (:START_ID, :END_ID) pair:
# the free energy is averaged, binding positions and sources are gathered into sets.
RNA_molecularly_interacts_with_RNA_inv = mirna_lncrna2.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
RNA_molecularly_interacts_with_RNA = (
    pd.concat([mirna_lncrna2, RNA_molecularly_interacts_with_RNA_inv])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Minimum_free_energy_kcal_mol': np.mean, 'Binding_pos': set, 'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'molecularly_interacts_with'})
)
RNA_molecularly_interacts_with_RNA.to_pickle(unprocessed_edge_data_location + 'RNA_molecularly_interacts_with_RNA.pkl')
RNA_molecularly_interacts_with_RNA.head(n=3)

***
### SNP - http://purl.obolibrary.org/obo/RO_0002566 (causally influences) - RNA

* ClinVar

In [None]:
# Download data
# Fetch ClinVar's tab-delimited variant summary from the NCBI FTP site into the raw-data folder.
# NOTE(review): the next cell reads 'variant_summary.txt', so data_downloader presumably
# decompresses the .gz on arrival — confirm against pkt_kg.utils.
url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz'
data_downloader(url, unprocessed_data_location, 'variant_summary.txt.gz')

In [None]:
clinvar_data = pd.read_csv(
    unprocessed_data_location + 'variant_summary.txt', header=0, delimiter='\t', low_memory=False,
)[['Name', 'GeneID', 'Assembly', 'RS# (dbSNP)', 'PhenotypeList', 'ReviewStatus']]
# Discard rows whose GeneID is the -1 placeholder.
clinvar_data = clinvar_data.loc[clinvar_data['GeneID'] != -1].drop(columns=['GeneID'])
# Keep only the three accepted review statuses.
clinvar_data = clinvar_data.loc[clinvar_data['ReviewStatus'].isin([
    "criteria provided, multiple submitters, no conflicts",
    "reviewed by expert panel",
    "practice guideline",
])].drop(columns=['ReviewStatus'])
# Discard rows whose dbSNP number is the -1 placeholder, then build 'rs<number>' identifiers.
clinvar_data = clinvar_data.loc[clinvar_data['RS# (dbSNP)'] != -1]
clinvar_data = clinvar_data.assign(**{
    'RS# (dbSNP)': 'rs' + clinvar_data['RS# (dbSNP)'].astype(str),
    # only the text before the first '.' of the variant name is used as the RefSeq join key
    'Name': clinvar_data['Name'].str.split(".").str[0],
})
clinvar_data = clinvar_data.loc[clinvar_data['Assembly'] == 'GRCh38'].drop(columns=['Assembly'])

# One row per '|'-separated phenotype, translated through disease_map where a match exists.
clinvar_data = clinvar_data.assign(
    PhenotypeList=clinvar_data['PhenotypeList'].str.lower().str.split("|")).explode('PhenotypeList')
clinvar_data = clinvar_data.merge(disease_map, right_on='0_y', left_on='PhenotypeList', how='left')
clinvar_data['0_x'] = clinvar_data['0_x'].fillna(clinvar_data['PhenotypeList'])
clinvar_data = clinvar_data.drop(columns=['0_y', 'PhenotypeList']).rename(columns={'0_x': 'Location'})
clinvar_data['Location'] = clinvar_data['Location'].replace(["not provided", "<NA>"], np.nan)

# RefSeq ID -> RNAcentral ID (:END_ID); the dbSNP identifier is the edge start.
clinvar_data = (
    clinvar_data
    .merge(rnacentral_map_human_refseq[['RNAcentral ID', 'RefSeq ID']].drop_duplicates()
           .rename(columns={'RefSeq ID': 'Name'}), on='Name')
    .rename(columns={'RNAcentral ID': ':END_ID', 'RS# (dbSNP)': ':START_ID'})
    .assign(Source='ClinVar')
)
clinvar_data.head(n=3)

* [PolymiRTS](https://compbio.uthsc.edu/miRSNP/home.php)

In [None]:
# The PolymiRTS table is read here but only downloaded by a later cell; fetch it first if it is
# missing so the notebook also runs top-to-bottom on a fresh checkout.
_polymirts_file = 'target_miRSNP_human_CLASH.txt'
if not os.path.exists(unprocessed_data_location + _polymirts_file):
    data_downloader('https://compbio.uthsc.edu/miRSNP/download/PolymiRTS3.0/' + _polymirts_file,
                    unprocessed_data_location, _polymirts_file)
snp_mirna = pd.read_csv(unprocessed_data_location + _polymirts_file,sep='\t')[['refseq','microRNA_name','mutid']]

# Names ending in '*' carry no arm suffix: try both '-5p' and '-3p' in place of the star.
# The substitutions must be computed from the starred rows themselves; taking them from the
# non-starred rows aligns on disjoint indices, yields NaN names and silently loses every
# starred miRNA at the notna() filter below.
_is_star = snp_mirna['microRNA_name'].str.contains("\\*", na=False)
_star_rows = snp_mirna[_is_star]
snp_mirna_5p = _star_rows.assign(microRNA_name=_star_rows['microRNA_name'].str.replace('*', '-5p', regex=False))
snp_mirna_3p = _star_rows.assign(microRNA_name=_star_rows['microRNA_name'].str.replace('*', '-3p', regex=False))
snp_mirna = pd.concat([snp_mirna[~_is_star], snp_mirna_5p, snp_mirna_3p])
snp_mirna = snp_mirna[(snp_mirna['microRNA_name'].notna()) & (snp_mirna['mutid'].notna())]
# Diagnostics: is every name known to RNAcentral, and which prefixes do the unknown ones have?
print(all(snp_mirna['microRNA_name'].isin(rnacentral_map_human['DB Description'])))
print(snp_mirna[~snp_mirna['microRNA_name'].isin(rnacentral_map_human['DB Description'])]['microRNA_name'].str[:3].unique())

# miRNA names unknown to RNAcentral are retried with each arm suffix appended; only the variants
# that exist in the map survive the inner join below.
_unmatched = snp_mirna[~snp_mirna['microRNA_name'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = _unmatched.assign(microRNA_name=_unmatched['microRNA_name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = _unmatched.assign(microRNA_name=_unmatched['microRNA_name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human[['RNAcentral ID','DB Description']].drop_duplicates().rename(
    columns={'DB Description':'microRNA_name'}), on='microRNA_name').drop(columns=['microRNA_name']).rename(columns={'RNAcentral ID':'RNA1'})

snp_mirna = pd.merge(snp_mirna, rnacentral_map_human[['RNAcentral ID','DB Description']].drop_duplicates().rename(
    columns={'DB Description':'microRNA_name'}), on='microRNA_name')
snp_mirna = pd.concat([snp_mirna.rename(columns={'RNAcentral ID':'RNA1'}),
                       miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['microRNA_name'])

# RefSeq transcript -> RNAcentral ID where available, otherwise the RefSeq ID itself; stored as 'Interactor'.
snp_mirna = pd.merge(snp_mirna, rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(columns={'RefSeq ID':'refseq'}),
                        on='refseq', how='left')
snp_mirna['RNAcentral ID'] = snp_mirna['RNAcentral ID'].fillna(snp_mirna['refseq'])
snp_mirna = snp_mirna.drop(columns=['refseq'])
snp_mirna = snp_mirna.rename(columns={'RNAcentral ID':'Interactor'})
snp_mirna['Source'] = 'PolymiRTS'
snp_mirna = snp_mirna.rename(columns={'mutid':':START_ID', 'RNA1':':END_ID'})
snp_mirna.head(n=3)

In [None]:
# Download the PolymiRTS 3.0 human CLASH table into the unprocessed data folder (overwrites any existing copy).
!wget https://compbio.uthsc.edu/miRSNP/download/PolymiRTS3.0/target_miRSNP_human_CLASH.txt -O ../resources/processed_data/unprocessed_data/target_miRSNP_human_CLASH.txt

In [None]:
# PolymiRTS CLASH table -> SNP (mutid) / target transcript (RNAcentral ID) edges,
# keeping the miRNA name as the 'Interactor' annotation.
snp_mrna = pd.read_csv(unprocessed_data_location + 'target_miRSNP_human_CLASH.txt',sep='\t')[['refseq','microRNA_name','mutid']]

# Right join: every PolymiRTS row is kept; rows whose RefSeq accession has no RNAcentral ID
# get NaN there and are removed by the dropna below.
snp_mrna = (
    rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']]
    .drop_duplicates()
    .merge(snp_mrna, left_on='RefSeq ID', right_on='refseq', how='right')
    .rename(columns={'mutid':':START_ID', 'RNAcentral ID':':END_ID', 'microRNA_name':'Interactor'})
)
snp_mrna = snp_mrna[[':START_ID',':END_ID','Interactor']].drop_duplicates()
snp_mrna = snp_mrna.dropna(subset=[':START_ID', ':END_ID']).assign(Source='PolymiRTS')
snp_mrna.head(n=3)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet SNP / miRNA-binding-site table; the saved file name keeps the literal "?dl=0" suffix,
# which is the name the next cell reads.
!wget https://www.dropbox.com/s/8aq8k0yoy5ak0d6/miRNet-snpmirbs-hsa.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-snpmirbs-hsa.csv?dl=0

In [None]:
# miRNet SNP / binding-site table -> SNP (rsid) / transcript (RNAcentral ID) edges.
gene_variant = (
    pd.read_csv(unprocessed_data_location + "miRNet-snpmirbs-hsa.csv?dl=0")[['rsid','transcript_id']]
    .rename(columns={'rsid':'SNP', 'transcript_id':'RefSeq ID'})
)

# Inner join: only transcripts whose RefSeq accession is mapped in RNAcentral survive.
gene_variant = (
    rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']]
    .drop_duplicates()
    .merge(gene_variant, on='RefSeq ID')
    .assign(Source='miRNet')
    .rename(columns={'SNP':':START_ID', 'RNAcentral ID':':END_ID'})
)
gene_variant.head(n=3)

In [None]:
# Disabled step: the whole cell is a bare string literal, so none of the code inside it runs;
# the notebook only echoes the text as the cell output.
# NOTE(review): the disabled code filters on a 'SNP' column, which the preceding cell renames to
# ':START_ID' -- it would need adapting before being re-enabled.
'''
# Since we are dealing with 350K edges, we keep only those whose SNP is already considered by other sources.
SNPnonO_data = pd.concat([pd.read_csv('../resources/edge_data/variant-piRNA2566.txt',sep='\t').rename(columns={'rs Id':'SNP'})['SNP'],
                          pd.read_csv('../resources/edge_data/variant-miRNA2566.txt',sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-premiRNA2566.txt', sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-disease3302.txt', sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-phenotype3302.txt', sep='\t')['SNP']]).drop_duplicates()

# This contains HDMM KG rsIDs
nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_mapORIGINAL.pkl')
nonO_data_series = pd.Series(nonO_data)

b = pd.DataFrame(nonO_data_series)
b['SNP'] = b.index

gene_variant = gene_variant[(gene_variant['SNP'].isin(SNPnonO_data)) | (gene_variant['SNP'].isin(b['SNP']))]
gene_variant
'''

* [piRBase](http://bigdata.ibp.ac.cn/piRBase/)

In [None]:
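# Manual download: open http://bigdata.ibp.ac.cn/piRBase/variants.php, set the page size to 1151 items (the page reports "Total amount: 1151"), click the "Display" button, then save the page as HTML. The next cell reads it as 'piRNA-variant.html' from the unprocessed data folder.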

In [None]:
# Second table of the saved piRBase variants page: keep rows that carry an rs id and
# turn the '>' in the variant label into '-'.
pirbase_tables = pd.read_html(unprocessed_data_location + 'piRNA-variant.html')
variant_piRNA = pirbase_tables[1]
variant_piRNA = variant_piRNA.loc[~variant_piRNA['rs Id'].isna(), ['Variants', 'rs Id']]
variant_piRNA = variant_piRNA.assign(Variants=variant_piRNA['Variants'].str.replace('>', '-'))
variant_piRNA.head(n=3)

In [None]:
# For every distinct variant label, fetch its piRBase detail page and collect the piRNA names
# listed in the page's second table. One HTTP request per variant, so this cell is slow.
variant_pages = []
for i in tqdm(variant_piRNA.Variants.unique()):
    temp = pd.read_html('http://bigdata.ibp.ac.cn/piRBase/variants2.php?organism=hsa&name=' + i)[1]
    # .assign returns a new frame, so nothing is written onto a slice of the scraped table.
    variant_pages.append(temp[['Name']].assign(Variants=i))

# Concatenate once instead of growing the frame inside the loop; with no variants at all,
# fall back to an empty frame that still has the columns the later merge expects.
df = pd.concat(variant_pages) if variant_pages else pd.DataFrame(columns=['Name', 'Variants'])

df.head(n=3)

In [None]:
#df.to_csv(unprocessed_data_location + 'piRBase_piRNA-variant.csv', index=False)
#df = pd.read_csv(unprocessed_data_location + 'piRBase_piRNA-variant.csv')

In [None]:
# Attach the scraped piRNA names to each variant, then translate piRBase ids to RNAcentral ids
# (both joins are inner joins) to obtain SNP (rs id) / piRNA edges.
variant_piRNA = (
    variant_piRNA
    .merge(df, on=['Variants'])
    .drop(columns=['Variants'])
    .rename(columns={'Name':'piRNA'})
)
pirbase_to_rnacentral = rnacentral_map_human_pirbase.rename(columns={'piRBase ID':'piRNA'})
variant_piRNA = (
    variant_piRNA
    .merge(pirbase_to_rnacentral, on='piRNA')
    .drop(columns=['piRNA'])
    .rename(columns={'RNAcentral ID':':END_ID', 'rs Id':':START_ID'})
    .assign(Source='piRBase')
)
variant_piRNA.head(n=3) # Empty

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet SNP / miRNA table; the saved file name keeps the literal "?dl=0" suffix,
# which is the name the next cell reads.
!wget https://www.dropbox.com/s/cu4hv35ulu3a8d6/miRNet-snp-mir-hsa.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-snp-mir-hsa.csv?dl=0

In [None]:
# miRNet SNP / miRNA table -> SNP (rsid) / miRNA (RNAcentral ID) edges, high-confidence rows only.
miRNA_variant = pd.read_csv(unprocessed_data_location + "miRNet-snp-mir-hsa.csv?dl=0")
miRNA_variant = miRNA_variant[miRNA_variant['High_Confidence']=='YES']
miRNA_variant = miRNA_variant.drop(columns=['mirnet','chr_pos','Mature_Pos','MIRNA_Name','Precursor_Pos','Mature_Name','gnomAD_MAF',
                                            'Robust_FANTOM5','Conserved_ADmiRE', 'Family_Name','Predicted_Motif',
                                            'AF_Percentile_gnomAD','Phylop_100way','Phastcons_100way','High_Confidence','MIRNA_Domain'])

# Use the mature accession when present, otherwise fall back to the precursor accession.
# Assign the result back: `df[col].fillna(..., inplace=True)` is chained in-place assignment,
# which pandas deprecates and which has no effect on the frame under Copy-on-Write.
miRNA_variant['Mature_Acc'] = miRNA_variant['Mature_Acc'].fillna(miRNA_variant['MIRNA_Acc'])
miRNA_variant = miRNA_variant.drop(columns=['MIRNA_Acc'])

# Diagnostic: are all accessions known to the miRBase -> RNAcentral map? Unknown ones are dropped by the inner merge.
print(all(miRNA_variant['Mature_Acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
miRNA_variant = pd.merge(miRNA_variant, rnacentral_map_human_mirbase.rename(columns={'miRBase ID':'Mature_Acc'}),
                          on='Mature_Acc').drop(columns=['Mature_Acc']).rename(columns={'RNAcentral ID':'RNA'})

miRNA_variant['Source'] = 'miRNet'
miRNA_variant = miRNA_variant.rename(columns={'RNA':':END_ID', 'rsid':':START_ID'})
miRNA_variant.head(n=3)

* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/)

In [None]:
!wget http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnp-generated-mir-targets-v11.03.csv -P ../resources/processed_data/unprocessed_data/mirdsnp-dsnp-generated-mir-targets-v11.03.csv

In [None]:
# miRdSNP targets table -> SNP (rsid) / miRNA (RNAcentral ID) edges, annotated with the target
# transcript ('Interactor') and the associated disease ('Location'). Only experimentally confirmed rows are kept.
miRNA_variant2 = pd.read_csv(unprocessed_data_location+'mirdsnp-dsnp-generated-mir-targets-v11.03.csv')
miRNA_variant2 = miRNA_variant2[miRNA_variant2['experimentally_confirmed'].notna()]
miRNA_variant2 = miRNA_variant2.rename(columns={'SNP':'rsid','miR':'MIRNA_Name'})
miRNA_variant2 = miRNA_variant2.drop(columns=['experimentally_confirmed','gene_name','distance'])

# Diagnostics: table size and whether any miRNA name matches RNAcentral as-is.
print(miRNA_variant2.shape)
in_rnacentral = miRNA_variant2['MIRNA_Name'].isin(rnacentral_map_human['DB Description'])
print(any(in_rnacentral))

# Names without a direct match are retried with an explicit '-5p' / '-3p' arm suffix.
# `.assign` builds new frames, so nothing is written onto a slice of `miRNA_variant2`.
unmatched = miRNA_variant2[~in_rnacentral]
miRNA_RNA_miRNAnotInRNAcentral5p = unmatched.assign(MIRNA_Name=unmatched['MIRNA_Name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = unmatched.assign(MIRNA_Name=unmatched['MIRNA_Name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])

# Keep rows whose name matches RNAcentral directly as well as the suffix-rescued ones, so that
# direct matches are not silently discarded (same approach as the PolymiRTS and SomamiR cells).
miRNA_variant2 = pd.concat([miRNA_variant2[in_rnacentral], miRNA_RNA_miRNAnotInRNAcentral])
miRNA_variant2 = pd.merge(miRNA_variant2, rnacentral_map_human[['RNAcentral ID','DB Description']].drop_duplicates().rename(
    columns={'DB Description':'MIRNA_Name'}), on='MIRNA_Name').drop(columns=['MIRNA_Name']).rename(columns={'RNAcentral ID':'RNA1'})

# Target transcript: RNAcentral ID when the RefSeq accession is mapped, otherwise the raw accession.
miRNA_variant2 = pd.merge(miRNA_variant2, rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(columns={'RefSeq ID':'refseq_id'}),
                        on='refseq_id', how='left')
miRNA_variant2['RNAcentral ID'] = miRNA_variant2['RNAcentral ID'].fillna(miRNA_variant2['refseq_id'])
miRNA_variant2 = miRNA_variant2.drop(columns=['refseq_id'])
miRNA_variant2 = miRNA_variant2.rename(columns={'RNAcentral ID':'Interactor'})

# Disease: mapped identifier from `disease_map` when the normalised name is found, otherwise the name itself.
miRNA_variant2['diseases'] = miRNA_variant2['diseases'].str.lower().str.strip()
miRNA_variant2 = pd.merge(miRNA_variant2, disease_map, right_on='0_y', left_on='diseases', how='left')
miRNA_variant2['0_x'] = miRNA_variant2['0_x'].fillna(miRNA_variant2['diseases'])
miRNA_variant2 = miRNA_variant2.drop(columns=['0_y', 'diseases'])
miRNA_variant2 = miRNA_variant2.rename(columns={'0_x':'Location'})
miRNA_variant2['Location'] = miRNA_variant2['Location'].replace("not provided", np.nan)
miRNA_variant2['Location'] = miRNA_variant2['Location'].replace("<NA>", np.nan)

miRNA_variant2['Source'] = 'miRdSNP'
miRNA_variant2 = miRNA_variant2.rename(columns={'rsid':':START_ID', 'RNA1':':END_ID'})
miRNA_variant2.head(n=3)

In [None]:
# Stack every SNP -> RNA source and collapse to one row per (SNP, RNA) pair; the annotation
# columns become sets of the values seen across sources.
snp_rna_sources = [miRNA_variant2, miRNA_variant, variant_piRNA, gene_variant, snp_mrna, snp_mirna, clinvar_data]
SNP_causally_influences_RNA = (
    pd.concat(snp_rna_sources)
    .groupby([':START_ID',':END_ID'])
    .agg({'Interactor':set, 'Location':set, 'Source':set})
    .reset_index()
    .assign(**{':TYPE': 'causally_influences'})
)
SNP_causally_influences_RNA.to_pickle(unprocessed_edge_data_location+'SNP_causally_influences_RNA.pkl')

# Inverse edge: same rows with start and end swapped and the inverse relation label.
RNA_causally_influenced_by_SNP = (
    SNP_causally_influences_RNA
    .rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
    .assign(**{':TYPE': 'causally_influenced_by'})
)
RNA_causally_influenced_by_SNP.to_pickle(unprocessed_edge_data_location+'RNA_causally_influenced_by_SNP.pkl')
RNA_causally_influenced_by_SNP.head(n=3)

***
### COSMIC - http://purl.obolibrary.org/obo/RO_0002566 (causally influences) - RNA

* COSMIC

In [None]:
# https://cancer.sanger.ac.uk/cosmic/download/cosmic/v101/noncodingvariantsvcfnormal --> "Non-Coding Variants VCF Normal" tab --> 
# --> Cosmic_NonCodingVariants_VcfNormal_v101_GRCh38. --> Register and click on "Download in browser"

In [None]:
# COSMIC non-coding variants VCF -> COSMIC variant id / transcript (RNAcentral ID) edges.
# Only the ID and INFO columns are needed, so only those two are loaded (the file is large).
# Column labels follow the VCF order (..., QUAL, FILTER, INFO).
cosmic = pd.read_csv(unprocessed_data_location + 'Cosmic_NonCodingVariants_Normal_v101_GRCh38.vcf', sep='\t', comment='#',
                     names=['chr','pos',':START_ID','REF','ALT','QUAL','FILTER','INFO'],
                     usecols=[':START_ID','INFO'])

# Keep canonical-transcript annotations only; na=False treats rows with a missing INFO field as non-matching.
cosmic = cosmic[cosmic['INFO'].str.contains('IS_CANONICAL=y', na=False)]

# Pull the transcript accession out of INFO (text after 'TRANSCRIPT=' up to the next ';') and drop its version suffix.
transcript_id = cosmic['INFO'].str.split('TRANSCRIPT=').str[1].str.split(';').str[0].str.split(".").str[0]
cosmic = cosmic.assign(INFO=transcript_id)

# Inner join: only transcripts mapped in RNAcentral are kept, so ':END_ID' is always an RNAcentral ID.
# NOTE(review): the original followed this with fillna(INFO), which cannot fire after an inner join.
# If unmapped Ensembl transcripts were meant to be kept, the merge needs how='left' -- confirm intent.
cosmic = cosmic.merge(rnacentral_map_human_ensembl[['RNAcentral ID','Ensembl transcript ID']].drop_duplicates().rename(
    columns={'Ensembl transcript ID':'INFO'}), on='INFO')
cosmic = cosmic.rename(columns={'RNAcentral ID':':END_ID'}).drop(columns=['INFO'])

# '.' is the VCF placeholder for a missing value.
cosmic = cosmic[(cosmic[':START_ID'] != ".") & (cosmic[':END_ID'] != ".")]
cosmic = cosmic.assign(Source='COSMIC')
cosmic.head(n=3)

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# Download the LncBook 2.0 variation table (gzip-compressed CSV) into the unprocessed data folder.
!wget https://ngdc.cncb.ac.cn/lncbook/files/variation_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/variation_LncBook2.0.csv.gz

In [None]:
# LncBook variation table -> COSMIC mutation id / lncRNA (RNAcentral ID) edges for pathogenic
# COSMIC variants, annotated with the tumour name ('Location').
lncRNA_disease2 = pd.read_csv(unprocessed_data_location+'variation_LncBook2.0.csv.gz').drop(
    columns=['Symbol','ClinVar Allele ID','ClinVar Variation Effect','ClinVar Disease Name','Variant Name','dbSNP ID']) # Mondo+HPO
print(lncRNA_disease2['COSMIC Variation Effect'].unique())
lncRNA_disease2 = lncRNA_disease2[lncRNA_disease2['COSMIC Variation Effect'] == 'Pathogenic'].drop(columns=['COSMIC Variation Effect'])
lncRNA_disease2 = lncRNA_disease2[lncRNA_disease2['COSMIC Tumor Name'] != '-']

# One row per tumour name (names are ';'-separated in the source column).
lncRNA_disease2 = lncRNA_disease2.assign(
    **{'COSMIC Tumor Name': lncRNA_disease2['COSMIC Tumor Name'].str.split(';')}).explode('COSMIC Tumor Name')

# Normalise the name: drop parenthesised qualifiers, trim surrounding whitespace, lower-case.
# .str.strip() also removes the trailing blank that removing a "(...)" suffix can leave behind,
# and, unlike a Python-level startswith check, tolerates missing values.
lncRNA_disease2['COSMIC Tumor Name'] = (
    lncRNA_disease2['COSMIC Tumor Name']
    .str.replace(r"\(.*?\)", "", regex=True)
    .str.strip()
    .str.lower()
)

# LncBook gene id -> RNAcentral ID (inner join: unmapped genes are dropped).
lncRNA_disease2 = pd.merge(lncRNA_disease2, rnacentral_map_human_lncbook[['RNAcentral ID','LncBook Gene ID']].drop_duplicates().rename(
    columns={'LncBook Gene ID':'Gene ID'}), on='Gene ID').drop(columns=['Gene ID']).rename(columns={'RNAcentral ID':'RNA'})

# Tumour name -> mapped identifier from `disease_map` when found, otherwise the name itself.
lncRNA_disease2 = pd.merge(lncRNA_disease2, disease_map, right_on='0_y', left_on='COSMIC Tumor Name', how='left')
lncRNA_disease2['0_x'] = lncRNA_disease2['0_x'].fillna(lncRNA_disease2['COSMIC Tumor Name'])
lncRNA_disease2 = lncRNA_disease2.drop(columns=['0_y', 'COSMIC Tumor Name'])
lncRNA_disease2 = lncRNA_disease2.rename(columns={'0_x':'Location'})

lncRNA_disease2['Source'] = 'LncBook'
lncRNA_disease2 = lncRNA_disease2.rename(columns={'RNA':':END_ID','COSMIC Mutation ID':':START_ID'})
lncRNA_disease2.head(n=3)

* SomamiR

In [None]:
# SomamiR lncRNA somatic-mutation table -> COSMIC id / lncRNA (RNAcentral ID) edges, annotated with
# the miRNA ('Interactor'), cancer class ('Location') and PubMed id. Only TargetScan-supported sites are kept.
# NOTE(review): the archive is read directly and the first column arrives named after the archived
# file, hence the rename to 'Gene' -- presumably a side effect of reading the tar member; confirm.
miRNA_lncRNA2 = pd.read_csv(unprocessed_data_location+'lncRNA_somatic_v2.0.txt.tar.gz',sep='\t',dtype={'PMID':str})
miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['Unnamed: 18']) # Mondo+HPO
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'lncRNA_somatic_v2.0.txt':'Gene'})
miRNA_lncRNA2['Gene'] = miRNA_lncRNA2['Gene'].str.replace('lnc-', '', regex=False)
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'Gene': 'symbol', 'miRNA':'mir_id'})
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['TargetScan_Site(0=No;1=Yes)'] == 1]

# LNCipedia 5.0 transcript ids -> 5.2 ids -> RNAcentral IDs (both inner joins).
lncpedia_map = pd.read_csv("https://lncipedia.org/downloads/lncipedia_5_0/lncipedia_5_0_vs_5_2.txt", sep='\t')
miRNA_lncRNA2 = miRNA_lncRNA2.merge(lncpedia_map.rename(columns={'LNCipedia 5.0 Transcript ID':'Transcript'}),
                                    on='Transcript').drop(columns=['Transcript']).rename(
                                        columns={'LNCipedia 5.2 Transcript ID':'Transcript'})
miRNA_lncRNA2 = pd.merge(rnacentral_map_human_lncipedia.rename(columns={'LNCipedia transcript ID':'Transcript'}),
                         miRNA_lncRNA2, on='Transcript').drop(columns=['Transcript','LNCipedia Gene ID']).rename(
                             columns={'RNAcentral ID':'RNA'})

miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['symbol','Chromosome','Location','Ref_Allele','Sample_Name',
                                            'Mut_Allele','FuncClass','Alteration','Target_Site',
                                            'Seed','SeedClass', 'TargetScan_Site(0=No;1=Yes)','Mut_ID'])

# Cancer class looks like "[site][...][...]": build "<first bracket> <last bracket>".
# The "[ns]" placeholder is rewritten to "[cancer]" first; the original computed this replacement
# but never stored it. regex=False makes the bracketed token a literal on every pandas version.
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['Cancer_Class'].notna()].copy()
cancer_class = miRNA_lncRNA2['Cancer_Class'].str.lower().str.replace('[ns]', '[cancer]', regex=False)
miRNA_lncRNA2['Cancer_Class'] = cancer_class.apply(
    lambda x: ' '.join([x.split('[')[1].split(']')[0], x.split('[')[-1].split(']')[0]])).str.replace('_', ' ')

# PMID as a clean integer string. Only a trailing ".0" is removed (an unanchored ".0" pattern can
# also eat digits inside the id when interpreted as a regex), and the textual "nan" produced by
# astype(str) becomes a real missing value.
pmid = pd.to_numeric(miRNA_lncRNA2['PMID'], errors='coerce').astype(str).str.replace(r'\.0$', '', regex=True)
miRNA_lncRNA2['PMID'] = pmid.replace({'<NA>': np.nan, 'nan': np.nan})

# Cancer class -> mapped identifier from `disease_map` when found, otherwise the text itself.
miRNA_lncRNA2 = pd.merge(miRNA_lncRNA2, disease_map, right_on='0_y', left_on='Cancer_Class', how='left')
miRNA_lncRNA2['0_x'] = miRNA_lncRNA2['0_x'].fillna(miRNA_lncRNA2['Cancer_Class'])
miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['0_y', 'Cancer_Class'])
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'0_x':'Location'})

# Every row is emitted twice, once credited to SomamiR and once to TargetScan.
miRNA_lncRNA2['Source'] = 'SomamiR'
miRNA_lncRNA3 = miRNA_lncRNA2.copy()
miRNA_lncRNA3['Source'] = 'TargetScan'
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2, miRNA_lncRNA3])
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'RNA':':END_ID','COSMIC_ID':':START_ID','mir_id':'Interactor','PMID':'PubMedID'})
miRNA_lncRNA2.head(n=3)

In [None]:
# Fetch the SomamiR miRNA somatic-mutation archive into the unprocessed data folder.
# NOTE(review): the next cell reads 'miRNA_somatic_v2.0.txt.tar', so data_downloader presumably
# strips the .gz layer on download -- confirm against the helper's implementation.
data_downloader('https://compbio.uthsc.edu/SomamiR/download/miRNA_somatic_v2.0.txt.tar.gz', unprocessed_data_location)

In [None]:
# SomamiR miRNA somatic-mutation table -> COSMIC id / miRNA (RNAcentral ID) edges,
# annotated with the cancer type ('Location').
somamir = pd.read_csv(unprocessed_data_location +
                      'miRNA_somatic_v2.0.txt.tar',sep='\t').drop(
                          columns=['Reference','Derived','SNP','Whole_Genome','Whole_Exome','Study_ID','Source',
                          'miRNA_Chromosome','Strand','Maturestart','Matureend','Mutation_Distance','Regioin',
                            'miR2GO_Execution_Sequence','Unnamed: 19','Sample_Name']) # Mondo+HPO

# Remove the literal "[NS]" placeholder. regex=False matters: read as a regular expression,
# "[NS]" is a character class that deletes every 'N' and 'S' in the text.
somamir['Cancer_Type'] = somamir['Cancer_Type'].str.replace("[NS]", "", regex=False)
somamir = somamir[somamir['Cancer_Type'] != ""].copy()
# Build "<first bracket> <last bracket>" from the bracketed cancer type.
# NOTE(review): when a single bracket group remains, its text is repeated ("x x") -- confirm this is intended.
somamir['Cancer_Type'] = somamir['Cancer_Type'].apply(
    lambda x: ' '.join([x.split('[')[1].split(']')[0], x.split('[')[-1].split(']')[0]])).str.replace('_', ' ')

# Cancer type -> mapped identifier from `disease_map` when found, otherwise the text itself.
somamir = pd.merge(somamir, disease_map, right_on='0_y', left_on='Cancer_Type', how='left')
somamir['0_x'] = somamir['0_x'].fillna(somamir['Cancer_Type'])
somamir = somamir.drop(columns=['0_y', 'Cancer_Type'])
somamir = somamir.rename(columns={'0_x':'Location'})

# Diagnostics: are all miRNA names known to RNAcentral, and which name prefixes are not?
in_rnacentral = somamir['miRNA_Name'].isin(rnacentral_map_human['DB Description'])
print(all(in_rnacentral))
print(somamir[~in_rnacentral]['miRNA_Name'].str[:3].unique())

# Names without a direct match are retried with an explicit '-5p' / '-3p' arm suffix.
# `.assign` builds new frames, so nothing is written onto a slice of `somamir`.
unmatched = somamir[~in_rnacentral]
miRNA_RNA_miRNAnotInRNAcentral5p = unmatched.assign(miRNA_Name=unmatched['miRNA_Name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = unmatched.assign(miRNA_Name=unmatched['miRNA_Name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA_Name'}), on='miRNA_Name').drop(columns=['miRNA_Name']).rename(columns={'RNAcentral ID':'RNA'})

# Direct matches (inner merge) plus the suffix-rescued rows; the RNAcentral bookkeeping columns are dropped.
somamir = pd.merge(somamir, rnacentral_map_human.rename(
    columns={'DB Description':'miRNA_Name'}), on='miRNA_Name').drop(
        columns=['DB','DB ID','miRNA_Name','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})
somamir = pd.concat([somamir, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Mutation_ID','Organism','RNA category'])

somamir['Source'] = 'SomamiR'
somamir = somamir.rename(columns={'RNA':':END_ID','Disease':'Location','COSMIC_ID':':START_ID'})
somamir.head(n=3)

In [None]:
# SomamiR circRNA somatic-mutation table -> COSMIC id / circRNA transcript edges, annotated with
# the miRNA ('Interactor'), cancer class ('Location') and PubMed id. Only TargetScan-supported sites are kept.
circRNA_miRNA = pd.read_csv(unprocessed_data_location + 'circRNA_somatic_v2.0.txt.tar.gz', sep="\t")
circRNA_miRNA = circRNA_miRNA[circRNA_miRNA['TargetScan_Site(0=No;1=Yes)'] == 1]
circRNA_miRNA = circRNA_miRNA.drop(columns=['Gene','Mut_ID','Chromosome','Location','Mut_Allele','FuncClass','Alteration','Ref_Allele',
                                            'Target_Site','Seed','SeedClass','TargetScan_Site(0=No;1=Yes)','Sample_Name','Unnamed: 18'])

# PMID as a clean integer string. Only a trailing ".0" is removed (an unanchored ".0" pattern can
# also eat digits inside the id when interpreted as a regex); textual missing markers become NaN.
pmid = pd.to_numeric(circRNA_miRNA['PMID'], errors='coerce').astype(str).str.replace(r'\.0$', '', regex=True)
circRNA_miRNA['PMID'] = pmid.replace({'<NA>': np.nan, 'nan': np.nan})

# Cancer class: lower-case, remove the literal "[ns]" placeholder (regex=False: as a regular
# expression it would delete every 'n' and 's'), underscores to spaces, then join the text of all
# remaining bracket groups with single spaces.
cancer_class = (
    circRNA_miRNA['Cancer_Class']
    .str.lower()
    .str.replace('[ns]', '', regex=False)
    .str.replace('_', ' ', regex=False)
)
circRNA_miRNA['Cancer_Class'] = cancer_class.astype(str).apply(
     lambda x: ' '.join(re.findall(r'\[(.*?)\]', x)) if '[' in x and ']' in x else x)
# The original followed this with explode('Cancer_Class'); the column holds plain strings at this
# point, so that call changed nothing and has been removed.

# Cancer class -> mapped identifier from `disease_map` when found, otherwise the text itself.
circRNA_miRNA = pd.merge(circRNA_miRNA, disease_map, right_on='0_y', left_on='Cancer_Class', how='left')
circRNA_miRNA['0_x'] = circRNA_miRNA['0_x'].fillna(circRNA_miRNA['Cancer_Class'])
circRNA_miRNA = circRNA_miRNA.drop(columns=['0_y', 'Cancer_Class'])
circRNA_miRNA = circRNA_miRNA.rename(columns={'0_x':'Location'})

# Every row is emitted twice, once credited to SomamiR and once to TargetScan.
circRNA_miRNA['Source'] = 'SomamiR'
circRNA_miRNA2 = circRNA_miRNA.copy()
circRNA_miRNA2['Source'] = 'TargetScan'
circRNA_miRNA = pd.concat([circRNA_miRNA, circRNA_miRNA2])
RNA_RNA8 = circRNA_miRNA.rename(columns={'Transcript':':END_ID','COSMIC_ID':':START_ID','miRNA':'Interactor',
                                         'PMID':'PubMedID'}).drop_duplicates()
RNA_RNA8.head(n=3)

In [None]:
# Stack every COSMIC -> RNA source, drop the unused 'GWAS Trait' column and exact duplicates, then
# collapse to one row per (COSMIC id, RNA) pair with the annotations gathered into sets.
cosmic_rna_sources = [cosmic, lncRNA_disease2, miRNA_lncRNA2, somamir, RNA_RNA8]
cosmic_causally_influences_OBO = pd.concat(cosmic_rna_sources).drop(columns=['GWAS Trait']).drop_duplicates()
cosmic_causally_influences_OBO = (
    cosmic_causally_influences_OBO
    .groupby([':START_ID',':END_ID'])
    .agg({'Source':set, 'Location':set, 'Interactor':set, 'PubMedID':set})
    .reset_index()
    .assign(**{':TYPE': 'causally_influences'})
)
cosmic_causally_influences_OBO.to_pickle(unprocessed_edge_data_location+'COSMIC_causally_influences_RNA.pkl')

# Inverse edge: same rows with start and end swapped and the inverse relation label.
OBO_causally_influenced_by_COSMIC = (
    cosmic_causally_influences_OBO
    .rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
    .assign(**{':TYPE': 'causally_influenced_by'})
)
OBO_causally_influenced_by_COSMIC.to_pickle(unprocessed_edge_data_location+'RNA_causally_influenced_by_COSMIC.pkl')
OBO_causally_influenced_by_COSMIC.head(n=3)

***
### SNP - http://purl.obolibrary.org/obo/RO_0002566 (causally influences) - OBO

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet SNP / transcription-factor-binding-site table; the saved file name keeps the
# literal "?dl=0" suffix, which is the name the next cell reads.
!wget https://www.dropbox.com/s/f87f2q9ryjs3il9/miRNet-snptfbs-hsa.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-snptfbs-hsa.csv?dl=0

In [None]:
# miRNet SNP / TF-binding-site table -> SNP (rsid) / transcription factor edges.
# Gene symbols are translated through `symbol_to_pro` (column 0 = symbol, column 1 = target id);
# the inner join drops symbols without a mapping.
TF_variant = (
    pd.read_csv(unprocessed_data_location + "miRNet-snptfbs-hsa.csv?dl=0") # TF (PRO)
    .drop(columns=['chr_pos','mirnet','entrez','name'])
    .merge(symbol_to_pro.rename(columns={0:'symbol'}), on=['symbol'])
    .drop(columns=['symbol'])
    .rename(columns={'rsid':'SNP', 1:'TF'})
)
TF_variant = (
    TF_variant[['SNP','TF']]
    .assign(Source='miRNet')
    .rename(columns={'SNP':':START_ID', 'TF':':END_ID'})
)
TF_variant.head(n=3)

In [None]:
# Disabled step: the whole cell is a bare string literal, so none of the code inside it runs;
# the notebook only echoes the text as the cell output.
# NOTE(review): the disabled code filters on a 'SNP' column, which the preceding cell renames to
# ':START_ID' -- it would need adapting before being re-enabled.
'''
# Since we are dealing with >2M edges, we keep only those whose SNP is already considered by other sources.
SNPnonO_data = pd.concat([pd.read_csv('../resources/edge_data/variant-piRNA2566.txt',sep='\t').rename(columns={'rs Id':'SNP'})['SNP'],
                          pd.read_csv('../resources/edge_data/variant-miRNA2566.txt',sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-premiRNA2566.txt', sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-gene2566.txt', sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-disease3302.txt', sep='\t')['SNP'],
                          pd.read_csv('../resources/edge_data/variant-phenotype3302.txt', sep='\t')['SNP']]).drop_duplicates()

# This contains HDMM KG rsIDs
nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_mapORIGINAL.pkl')
nonO_data_series = pd.Series(nonO_data)

b = pd.DataFrame(nonO_data_series)
b['SNP'] = b.index

TF_variant = TF_variant[(TF_variant['SNP'].isin(SNPnonO_data)) | (TF_variant['SNP'].isin(b['SNP']))]
TF_variant
'''

* [miRdSNP](http://mirdsnp.ccr.buffalo.edu/index.php)

In [None]:
# Download the miRdSNP disease-SNP table into the unprocessed data folder (overwrites any existing copy).
!wget http://mirdsnp.ccr.buffalo.edu/downloads/mirdsnp-dsnps-v11.03.csv -O ../resources/processed_data/unprocessed_data/mirdsnp-dsnps-v11.03.csv

In [None]:
# miRdSNP disease-SNP table -> SNP / disease edges with the supporting PubMed id.
disease_variant = pd.read_csv(unprocessed_data_location+'mirdsnp-dsnps-v11.03.csv') # Mondo+HPO
disease_variant = disease_variant.drop(columns=['pub_year','pub_month','link','journal','article_date','title'])
disease_variant['disease'] = disease_variant['disease'].str.lower()

# One row per SNP; ids are comma-separated in the source column, so trim any blank left around them.
disease_variant['snps'] = disease_variant['snps'].str.split(',')
disease_variant = disease_variant.explode('snps')
disease_variant['snps'] = disease_variant['snps'].str.strip()

# Disease name -> identifier via `desc_disPhe_map` (column 0 = name, column 1 = identifier);
# the inner join drops diseases without a mapping.
disease_variant = pd.merge(disease_variant, desc_disPhe_map.rename(columns={0:'disease'}),on='disease')
disease_variant = disease_variant.drop(columns=['disease'])

# PubMed id as a clean integer string. Only a trailing ".0" is removed (an unanchored ".0" pattern
# can also eat digits inside the id when interpreted as a regex), and the textual "nan" produced by
# astype(str) becomes a real missing value.
pubmed_id = pd.to_numeric(disease_variant['pubmed_id'], errors='coerce').astype(str).str.replace(r'\.0$', '', regex=True)
disease_variant['pubmed_id'] = pubmed_id.replace({'<NA>': np.nan, 'nan': np.nan})

disease_variant = disease_variant.rename(columns={'pubmed_id':'PMID','snps':'SNP', 1:'Disease'})
disease_variant['Source'] = 'miRdSNP'
disease_variant = disease_variant.rename(columns={'Disease':':END_ID', 'SNP':':START_ID', 'PMID':'PubMedID'})
disease_variant.head(n=3)

In [None]:
# Stack the SNP -> TF and SNP -> disease tables and collapse to one row per (SNP, target) pair,
# gathering PubMed ids and sources into sets.
SNP_causally_influences_OBO = (
    pd.concat([TF_variant, disease_variant])
    .groupby([':START_ID',':END_ID'])
    .agg({'PubMedID':set, 'Source':set})
    .reset_index()
    .assign(**{':TYPE': 'causally_influences'})
)
SNP_causally_influences_OBO.to_pickle(unprocessed_edge_data_location+'SNP_causally_influences_OBO.pkl')

# Inverse edge: same rows with start and end swapped and the inverse relation label.
OBO_causally_influenced_by_SNP = (
    SNP_causally_influences_OBO
    .rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
    .assign(**{':TYPE': 'causally_influenced_by'})
)
OBO_causally_influenced_by_SNP.to_pickle(unprocessed_edge_data_location+'OBO_causally_influenced_by_SNP.pkl')
OBO_causally_influenced_by_SNP.head(n=3)

***
### Chemical modification - http://purl.obolibrary.org/obo/RO_0002314 (characteristic of part of) - RNA

* [GtRNAdb](http://gtrnadb.ucsc.edu/GtRNAdb2/index.html) <br /> 
The genomic tRNA database contains tRNA gene predictions made by tRNAscan-SE on complete or nearly complete genomes. Unless otherwise noted, all annotation is automated, and has not been inspected for agreement with published literature.

In [None]:
# Download the GtRNAdb hg38 tRNA FASTA file into the unprocessed data folder (overwrites any existing copy).
!wget http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa -O ../resources/processed_data/unprocessed_data/hg38-tRNAs.fa

In [None]:
# Read the GtRNAdb FASTA file into a two-column table: record identifier (first word of the
# header line) and sequence.
fasta_file_path = unprocessed_data_location + 'hg38-tRNAs.fa'

with open(fasta_file_path) as fasta_file:
    fasta_records = [(title.split(None, 1)[0], sequence)  # First word is ID
                     for title, sequence in SimpleFastaParser(fasta_file)]

identifiers = [record_id for record_id, _sequence in fasta_records]
seq = [sequence for _record_id, sequence in fasta_records]

data = {"Identifier": identifiers, "Sequence": seq}
df = pd.DataFrame(data)
# Cut the leading 'Homo_sapiens_' (by length) off every identifier.
df['Identifier'] = df['Identifier'].str[len('Homo_sapiens_'):]
df

In [None]:
# Scrape the GtRNAdb gene page of each tRNA and keep its gene symbol and known modifications.
# One HTTP request per gene, so this cell is slow.
# NOTE(review): the [1:] slice skips the first identifier of `df` -- confirm that this is intentional.
gene_pages = []
for identifier in tqdm(df['Identifier'] [1:]):
    # Fetch and parse the page once; the first two tables are transposed and placed side by side.
    page_tables = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/' + identifier + '.html')
    temp = pd.concat([page_tables[0].T, page_tables[1].T], axis=1)
    # After transposing, the first row holds the field names: promote it to the header.
    temp.columns = temp.iloc[0]
    gene_pages.append(temp[1:])

# Concatenate once instead of growing the frame inside the loop.
tRNA = pd.concat(gene_pages)
# The original cleaned the 'Locus' text here and then dropped that column on the next line;
# the dead cleanup has been removed.
tRNA = tRNA[['GtRNAdb Gene Symbol', 'Known Modifications (Modomics)']]
tRNA

In [None]:
# One row per (tRNA, modification): the modification list is space-separated in the scraped table.
tRNA['Known Modifications (Modomics)'] = tRNA['Known Modifications (Modomics)'].str.split(' ')
tRNA_mod = tRNA[['GtRNAdb Gene Symbol', 'Known Modifications (Modomics)']].explode('Known Modifications (Modomics)').dropna()
tRNA_mod = tRNA_mod.rename(columns={'GtRNAdb Gene Symbol':'tRNA', 'Known Modifications (Modomics)':'Modification'})

#tRNA_mod = pd.read_csv(edge_data_location+'modification-tRNA2314.txt', sep='\t').drop(columns=['Source(s)'])
# regex=False: strip the literal '.html'; read as a regular expression the leading '.' would match any character.
tRNA_mod['tRNA'] = tRNA_mod['tRNA'].str.replace('.html', '', regex=False)
# GtRNAdb gene id -> RNAcentral ID (inner join: unmapped genes are dropped).
tRNA_mod = pd.merge(tRNA_mod, rnacentral_map_human_gtrnadb[['RNAcentral ID','GtRNAdb Gene ID']].drop_duplicates().rename(
    columns={'GtRNAdb Gene ID':'tRNA'}), on='tRNA').drop(
        columns=['tRNA']).rename(columns={'RNAcentral ID':'RNA'})

# Credit both sources: every row is duplicated, once per source name.
tRNA_mod['Source'] = 'GtRNAdb, Modomics'
tRNA_mod['Source'] = tRNA_mod['Source'].str.split(', ')
tRNA_mod = tRNA_mod.explode('Source')
tRNA_mod = tRNA_mod.rename(columns={'RNA':':END_ID','Modification':':START_ID'})
tRNA_mod.head(n=3)

In [None]:
# Collapse duplicate (modification, RNA) pairs, collecting their sources in a set.
tRNA_mod = tRNA_mod.groupby([':START_ID',':END_ID']).agg({'Source':set}).reset_index()
# The relation label is assigned AFTER the aggregation: the agg output only contains the
# group keys and 'Source', so a ':TYPE' column set beforehand never reached the pickle.
tRNA_mod[':TYPE'] = 'characteristic_of_part_of'
tRNA_mod.to_pickle(unprocessed_edge_data_location+'ChemicalModification_characteristic_of_part_of_RNA.pkl')
tRNA_mod.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002434 (interacts with) - OBO

* CTD

In [None]:
# Chemical-protein edges from the CTD chemical-gene interaction report.
ctd_pro = pd.read_csv(unprocessed_data_location+'CTD_chem_gene_ixns.tsv', sep='\t', comment="#",
                          names=['ChemicalName','ChemicalID','CasRN','GeneSymbol','GeneID','GeneForms',
                                 'Organism','OrganismID','Interaction','InteractionActions','PubMedIDs'])
# Keep human records whose gene form starts with "protein" and whose action is not the bare 'affects'.
ctd_pro = ctd_pro[ctd_pro['InteractionActions'] != 'affects']
ctd_pro = ctd_pro[ctd_pro['Organism'] == 'Homo sapiens']
ctd_pro = ctd_pro[ctd_pro['GeneForms'].str.startswith("protein", na=False)]
# .copy(): the assignments below must write to an independent frame, not to a slice.
ctd_pro = ctd_pro[['ChemicalID','GeneID','PubMedIDs']].copy()
ctd_pro['ChemicalID'] = 'MESH_' + ctd_pro['ChemicalID']

# Normalise PubMed IDs to digit-only strings; anything non-numeric becomes NaN.
# The previous str.replace(".0", "") is a regular expression on pandas < 2.0 (any character
# followed by '0'), which strips digits from the IDs, and missing values ended up as the
# string "nan" because replace("<NA>", ...) never matched.
# NOTE(review): entries that are not a single number are coerced to NaN -- confirm that
# discarding such references is acceptable.
pmid_numeric = pd.to_numeric(ctd_pro['PubMedIDs'], errors='coerce')
ctd_pro['PubMedIDs'] = pmid_numeric.map(lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

# Map the chemical and gene identifiers through mesh_to_chebi and entrez_pro_map (inner joins).
# Both maps carry their target ID in column 1, hence the '1_x' / '1_y' suffixes.
ctd_pro = pd.merge(ctd_pro, mesh_to_chebi.rename(columns={0:'ChemicalID'}), on='ChemicalID').drop(columns=['ChemicalID'])
ctd_pro = ctd_pro.merge(entrez_pro_map.rename(columns={0:'GeneID'}), on='GeneID')
ctd_pro = ctd_pro[['PubMedIDs','1_x','1_y']].rename(columns={'PubMedIDs':'PubMedID','1_x':':START_ID','1_y':':END_ID'})
ctd_pro['Source'] = 'CTD'
ctd_pro.head(n=3)

In [None]:
# The relation is stored in both directions: add the reverse of every edge.
ctd_pro_inv = ctd_pro.rename(columns={':END_ID':':START_ID',':START_ID':':END_ID'})
ctd_pro = pd.concat([ctd_pro, ctd_pro_inv])
# Collapse duplicate pairs, collecting their sources in a set.
# NOTE(review): 'PubMedID' is not aggregated and is therefore absent from the output.
ctd_pro = ctd_pro.groupby([':START_ID',':END_ID']).agg({'Source':set}).reset_index()
# Assigned after the aggregation, which would otherwise drop the column from the pickle.
ctd_pro[':TYPE'] = 'interacts_with'
ctd_pro.to_pickle(unprocessed_edge_data_location+'OBO_interacts_with_OBO.pkl')
ctd_pro.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with) - OBO

* STRING

In [None]:
# Fetch the STRING v12.0 protein-links file for taxon 9606 into the unprocessed-data folder.
# NOTE(review): the next cell reads the uncompressed '.txt' file, so data_downloader
# presumably decompresses the '.gz' archive -- confirm against pkt_kg.utils.
data_downloader("https://stringdb-downloads.org/download/protein.links.v12.0/9606.protein.links.v12.0.txt.gz", unprocessed_data_location)

In [None]:
# Protein-protein links from STRING, restricted to combined_score >= 700.
# r'\s+' replaces '\s': the latter is an invalid escape sequence in a normal string literal
# and, as a one-character regex, forces pandas onto the slow Python parsing engine.
string = pd.read_csv(unprocessed_data_location + "9606.protein.links.v12.0.txt", sep=r'\s+')
string = string[string['combined_score'] >= 700].copy()
# Remove only the leading taxon prefix; the unanchored, unescaped "9606." pattern could also
# match inside an identifier when interpreted as a regular expression.
for protein_column in ('protein1', 'protein2'):
    string[protein_column] = string[protein_column].str.replace(r'^9606\.', '', regex=True)
# Map both interactors through string_pro (inner joins); the mapped IDs of the two
# merges end up in columns '1_x' and '1_y'.
string = pd.merge(string, string_pro.rename(columns={0:'protein1'}), on='protein1').merge(
    string_pro.rename(columns={0:'protein2'}), on='protein2')
string = string[['1_x','1_y','combined_score']]
string = string.rename(columns={'1_x':':START_ID','1_y':':END_ID','combined_score':'STRING_score'})
string['Source'] = 'STRING'
string.head(n=3)

* UniProtKB

In [None]:
# UniProtKB REST stream query: TSV output with the accession, xref_pro, cc_cofactor and
# cc_catalytic_activity fields for entries matching (taxonomy_id:9606) AND (reviewed:true).
url = 'https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Cxref_pro%2Ccc_cofactor%2Ccc_catalytic_activity&format=tsv&query=%28%28taxonomy_id%3A9606%29+AND+%28reviewed%3Atrue%29%29'
# NOTE(review): the third argument is presumably the output file name -- the following
# cells read 'uniprot-cofactor-catalyst.tab' from unprocessed_data_location.
data_downloader(url, unprocessed_data_location, 'uniprot-cofactor-catalyst.tab')

In [None]:
# Cofactor (ChEBI) -> protein (PRO) edges from the UniProtKB export.
uniprot_export_path = unprocessed_data_location + 'uniprot-cofactor-catalyst.tab'
df = pd.read_csv(uniprot_export_path, sep='\t').drop(columns=['Entry','Catalytic activity'])

# One row per ';'-separated PRO cross-reference; keep only values starting with 'PR'.
df = df.assign(PRO=df['PRO'].str.split(';')).explode('PRO')
is_pro_identifier = (df['PRO'].notna()) & (df['PRO'].str.startswith('PR'))
df = df[is_pro_identifier]
df['PRO'] = df['PRO'].str.replace('PR:', 'PR_')

# Keep the first 'CHEBI:...' run (up to the next ';') of the cofactor annotation,
# switch to the underscore form and split it on ', '.
cofactor_ids = df['Cofactor'].str.extract(r'(CHEBI:[^;]+)', expand=False)
cofactor_ids = cofactor_ids.str.replace('CHEBI:', 'CHEBI_')
df['Cofactor'] = cofactor_ids.str.split(', ')
df = df.explode('Cofactor').dropna()

df['Source'] = 'UniProtKB'
cofactor = df.rename(columns={'Cofactor':':START_ID','PRO':':END_ID'})
cofactor.head(n=3)

In [None]:
# Catalytic-activity participant (ChEBI) -> protein (PRO) edges from the UniProtKB export.
uniprot_export_path = unprocessed_data_location + 'uniprot-cofactor-catalyst.tab'
df = pd.read_csv(uniprot_export_path, sep='\t').drop(columns=['Entry','Cofactor'])

# One row per ';'-separated PRO cross-reference; keep only values starting with 'PR'.
df = df.assign(PRO=df['PRO'].str.split(';')).explode('PRO')
is_pro_identifier = (df['PRO'].notna()) & (df['PRO'].str.startswith('PR'))
df = df[is_pro_identifier]
df['PRO'] = df['PRO'].str.replace('PR:', 'PR_')

# Keep the first 'CHEBI:...' run (up to the next ';'), collapse the doubled
# 'ChEBI:CHEBI:' prefix, switch to the underscore form and split on ', '.
activity_ids = df['Catalytic activity'].str.extract(r'(CHEBI:[^;]+)', expand=False)
activity_ids = activity_ids.str.replace('ChEBI:CHEBI:', 'CHEBI:')
activity_ids = activity_ids.str.replace('CHEBI:', 'CHEBI_')
df['Catalytic activity'] = activity_ids.str.split(', ')
df = df.explode('Catalytic activity').dropna()

df['Source'] = 'UniProtKB'
catalyst = df.rename(columns={'Catalytic activity':':START_ID','PRO':':END_ID'})
catalyst.head(n=3)

* CTD

In [None]:
# Fetch the CTD 'CTD_chem_go_enriched' report into the unprocessed-data folder.
# NOTE(review): the next cell reads the uncompressed '.tsv', so data_downloader presumably
# decompresses the '.gz' archive -- confirm against pkt_kg.utils.
data_downloader("https://ctdbase.org/reports/CTD_chem_go_enriched.tsv.gz", unprocessed_data_location)

In [None]:
# Load the CTD chemical-GO enrichment report and keep the most significant associations.
ctd_go_column_names = ['ChemicalName','ChemicalID','CasRN','Ontology','GOTermName','GOTermID',
                       'HighestGOLevel','PValue','CorrectedPValue','TargetMatchQty','TargetTotalQty',
                       'BackgroundMatchQty','BackgroundTotalQty']
ctd_go = pd.read_csv(unprocessed_data_location+'CTD_chem_go_enriched.tsv', sep='\t', comment="#",
                     names=ctd_go_column_names)

# Rows at or below the corrected p-value cut-off, restricted to the columns used downstream.
passes_cutoff = ctd_go['CorrectedPValue']<=1.04e-47
ctd_go = ctd_go.loc[passes_cutoff, ['ChemicalID','GOTermID','CorrectedPValue','TargetMatchQty']]
# Prefix chemical IDs with 'MESH_' and switch GO IDs to the underscore form.
ctd_go = ctd_go.assign(
    ChemicalID="MESH_" + ctd_go['ChemicalID'],
    GOTermID=ctd_go['GOTermID'].str.replace('GO:', 'GO_'))
ctd_go.head(n=3)

In [None]:
# Map chemical IDs through mesh_to_chebi (inner join) and name the edge columns:
# GO term as start node, mapped chemical (column 1 of the map) as end node.
mesh_to_chebi_by_chemical = mesh_to_chebi.rename(columns={0:'ChemicalID'})
ctd_go = (
    ctd_go
    .merge(mesh_to_chebi_by_chemical, on='ChemicalID')
    .rename(columns={'GOTermID':':START_ID', 1:':END_ID', 'CorrectedPValue':'FDR'})
    .assign(Source='CTD')
)
ctd_go.head(n=3)

In [None]:
# Reload the CTD chemical-gene interaction report for the molecularly-interacts-with edges.
ctd_ixn_column_names = ['ChemicalName','ChemicalID','CasRN','GeneSymbol','GeneID','GeneForms',
                        'Organism','OrganismID','Interaction','InteractionActions','PubMedIDs']
ctd_pro = pd.read_csv(unprocessed_data_location+'CTD_chem_gene_ixns.tsv', sep='\t', comment="#",
                      names=ctd_ixn_column_names)

# Human records, action other than the bare 'affects', gene form starting with "protein".
keep_row = (
    (ctd_pro['InteractionActions'] != 'affects')
    & (ctd_pro['Organism'] == 'Homo sapiens')
    & ((ctd_pro['GeneForms'].notna()) & (ctd_pro['GeneForms'].str.startswith("protein")))
)
ctd_pro = ctd_pro.loc[keep_row, ['ChemicalID','GeneID','PubMedIDs']]
ctd_pro = ctd_pro.assign(ChemicalID='MESH_' + ctd_pro['ChemicalID'])
ctd_pro.head(n=3)

In [None]:
# Map the chemical and gene identifiers through mesh_to_chebi and entrez_pro_map (inner joins).
# Both maps carry their target ID in column 1, hence the '1_x' / '1_y' suffixes.
ctd_pro = pd.merge(ctd_pro, mesh_to_chebi.rename(columns={0:'ChemicalID'}), on='ChemicalID').drop(columns=['ChemicalID'])
ctd_pro = ctd_pro.merge(entrez_pro_map.rename(columns={0:'GeneID'}), on='GeneID')

# Normalise PubMed IDs to digit-only strings; anything non-numeric becomes NaN.
# The previous str.replace(".0", "") is a regular expression on pandas < 2.0 (any character
# followed by '0'), which strips digits from the IDs, and missing values ended up as the
# string "nan" because replace("<NA>", ...) never matched.
# NOTE(review): entries that are not a single number are coerced to NaN -- confirm that
# discarding such references is acceptable.
pmid_numeric = pd.to_numeric(ctd_pro['PubMedIDs'], errors='coerce')
ctd_pro['PubMedIDs'] = pmid_numeric.map(lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

ctd_pro['Source'] = 'CTD'
ctd_pro = ctd_pro.rename(columns={'1_x':':START_ID','1_y':':END_ID','PubMedIDs':'PubMedID'})
ctd_pro.head(n=3)

In [None]:
# Stack the CTD, UniProtKB and STRING edge tables built above.
OBO_molecularly_interacts_with_OBO = pd.concat([ctd_pro, ctd_go, catalyst, cofactor, string])
# The relation is stored in both directions: add the reverse of every edge.
OBO_molecularly_interacts_with_OBO_inv = OBO_molecularly_interacts_with_OBO.rename(columns={':END_ID':':START_ID',':START_ID':':END_ID'})
OBO_molecularly_interacts_with_OBO = pd.concat([OBO_molecularly_interacts_with_OBO, OBO_molecularly_interacts_with_OBO_inv])
# Collapse duplicate pairs: average FDR, set of sources. 'mean' is the string spelling of
# np.mean, which pandas >= 2.1 reports as deprecated when passed as a callable.
# NOTE(review): 'PubMedID' and 'STRING_score' are not aggregated and are therefore absent
# from the output.
OBO_molecularly_interacts_with_OBO = OBO_molecularly_interacts_with_OBO.groupby(
    [':START_ID',':END_ID']).agg({'FDR':'mean', 'Source':set}).reset_index()
# Assigned after the aggregation, which would otherwise drop the column from the pickle.
OBO_molecularly_interacts_with_OBO[':TYPE'] = 'molecularly_interacts_with'
OBO_molecularly_interacts_with_OBO.to_pickle(unprocessed_edge_data_location+'OBO_molecularly_interacts_with_OBO.pkl')
OBO_molecularly_interacts_with_OBO.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002200 (has phenotype) - OBO

* HPO

In [None]:
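# Manual step: open https://hpo.jax.org/data/annotations, click the "DOWNLOAD HPO ANNOTATIONS" button and save the file as 'phenotype.hpoa' in the unprocessed-data folder (the next cell reads it from there).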

In [None]:
# Disease-phenotype annotations from the HPO annotation file.
dis_phen = pd.read_csv(unprocessed_data_location+'phenotype.hpoa', sep='\t', comment="#")[['database_id', 'hpo_id','reference','frequency']]
# Keep the part after the ':' of the database ID and switch HPO IDs to the underscore form.
dis_phen['database_id'] = dis_phen['database_id'].str.split(":").str[1]
dis_phen['hpo_id'] = dis_phen['hpo_id'].str.replace(":", "_")

# Normalise references to digit-only PubMed ID strings; anything that is not a single
# number after removing 'PMID:' becomes NaN.
# The previous str.replace(".0", "") is a regular expression on pandas < 2.0 (any character
# followed by '0'), which strips digits from the IDs, and missing values ended up as the
# string "nan" because replace("<NA>", ...) never matched.
reference_numeric = pd.to_numeric(dis_phen['reference'].str.replace("PMID:", ""), errors='coerce')
dis_phen['reference'] = reference_numeric.map(lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)
# dis_phen['frequency'] = dis_phen['frequency'].str.split("/").str[0].astype(float) / dis_phen['frequency'].str.split("/").str[1].astype(float)
dis_phen.head(n=3)

In [None]:
# Map database IDs through disgenet_mondo_map (inner join) and name the edge columns:
# mapped disease (column 1 of the map) as start node, HPO term as end node.
mondo_by_database_id = disgenet_mondo_map.rename(columns={0:'database_id'})
dis_phen = (
    dis_phen
    .merge(mondo_by_database_id, on='database_id')
    .rename(columns={1:':START_ID','hpo_id':':END_ID','reference':'PubMedID'})
    .assign(Source='HPO')
)
dis_phen.head(n=3)

In [None]:
# Collapse duplicate (disease, phenotype) pairs into sets of PubMed IDs and sources,
# then write the edge list once per direction.
edge_key_columns = [':START_ID',':END_ID']
dis_phen = (
    dis_phen
    .groupby(edge_key_columns)
    .agg({'PubMedID':set,'Source':set})
    .reset_index()
    .assign(**{':TYPE': 'has_phenotype'})
)
dis_phen.to_pickle(unprocessed_edge_data_location+'OBO_has_phenotype_OBO.pkl')

# Inverse relation: swap start and end nodes and relabel.
dis_phen = (
    dis_phen
    .rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
    .assign(**{':TYPE': 'phenotype_of'})
)
dis_phen.to_pickle(unprocessed_edge_data_location+'OBO_phenotype_of_OBO.pkl')
dis_phen.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002436 (molecularly interacts with) - OBO

* [Apta-Index](https://www.aptagen.com/apta-index/) <br/>
Apta-Index is the most advanced user-friendly database on aptamers. Aptagen does not list the information contained herein as products, but as a database of information obtained from the published literature. 

In [None]:
# Manual collection of data from https://www.aptagen.com/apta-index/
aptamer_protein = pd.read_csv(unprocessed_data_location + 'aptaindex.csv',names=['Name', 'ID', 'Target', 'Sequence']) # PRO
aptamer_protein.Target = aptamer_protein.Target.str.lower()
aptamer_protein = pd.merge(aptamer_protein, desc_pro_map.rename(columns={0:'Target'}),on='Target')
aptamer_protein['ID'] = 'aptamer-details/?id=' + aptamer_protein['ID'].astype(str)
aptamer_protein = aptamer_protein.drop(columns=['Name','Target', 'Sequence'])

aptamer_protein['Source'] = 'Apta-Index'
aptamer_protein.rename(columns={'ID':':START_ID',1:':END_ID'},inplace=True)
aptamer_protein.head(n=3)

In [None]:
# Aptamer -> chemical target edges from the same Apta-Index table.
aptaindex_column_names = ['Name', 'ID', 'Target', 'Sequence']
aptamer_chemical = (
    pd.read_csv(unprocessed_data_location + 'aptaindex.csv', names=aptaindex_column_names) # ChEBI+DrugBank
    # Targets are matched through desc_chebi_map on their lower-cased name (inner join).
    .assign(Target=lambda table: table['Target'].str.lower())
    .merge(desc_chebi_map.rename(columns={0:'Target'}), on='Target')
    # The aptamer node ID is the Apta-Index detail-page suffix.
    .assign(ID=lambda table: 'aptamer-details/?id=' + table['ID'].astype(str))
    .drop(columns=['Name','Target','Sequence'])
    .assign(Source='Apta-Index')
    .rename(columns={'ID':':START_ID',1:':END_ID'})
)
aptamer_chemical.head(n=3)

* LncBook

In [None]:
# Download the LncBook 2.0 lncRNA-RBP table (gzipped CSV) into the unprocessed-data folder.
!wget https://ngdc.cncb.ac.cn/lncbook/files/lncrna_rbp_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/lncrna_rbp_LncBook2.0.csv.gz

In [None]:
# lncRNA -> RNA-binding protein edges from the LncBook table downloaded above.
lncRNA_protein = pd.read_csv(unprocessed_data_location + 'lncrna_rbp_LncBook2.0.csv.gz') # RBP (PRO) 
lncRNA_protein.drop(columns=['Gene ID','Symbol','Length'],inplace=True)

# Replace LncBook transcript IDs by RNAcentral IDs (inner join: unmapped transcripts are dropped).
lncRNA_protein = pd.merge(lncRNA_protein, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].rename(
    columns={'LncBook Transcript ID': 'Transcript ID'}).drop_duplicates(), on='Transcript ID').drop(columns=['Transcript ID']).rename(columns={
        'RNAcentral ID':'RNA'})
# Replace the 'Protein' value by column 1 of symbol_to_pro (inner join: unmapped proteins are dropped).
lncRNA_protein = pd.merge(lncRNA_protein, symbol_to_pro.rename(columns={0:'Protein'}), on='Protein').drop(columns=['Protein']).rename(columns={
        1:'RBP'})

# Normalise cell lines through location_map (left join on its '0_y' column); values without a
# match keep their lower-cased original text.
lncRNA_protein['Cell Line'] = lncRNA_protein['Cell Line'].str.lower()
# NOTE(review): 'Cell Line' holds plain strings here (no split was applied), so this explode
# leaves the frame unchanged -- confirm whether a split on a separator was intended.
lncRNA_protein = lncRNA_protein.explode('Cell Line')
lncRNA_protein = pd.merge(lncRNA_protein, location_map, right_on='0_y', left_on='Cell Line', how='left')
lncRNA_protein['0_x'] = lncRNA_protein['0_x'].fillna(lncRNA_protein['Cell Line'])
lncRNA_protein = lncRNA_protein.drop(columns=['0_y', 'Cell Line'])
lncRNA_protein = lncRNA_protein.rename(columns={'0_x':'Location'})

# Binding position as "<Start>-<End>"; rows where both are missing ("nan-nan") become NaN.
lncRNA_protein['Binding_pos'] = lncRNA_protein['Start'].astype(str) + "-" + lncRNA_protein['End'].astype(str)
lncRNA_protein['Binding_pos'] = lncRNA_protein['Binding_pos'].replace("nan-nan", np.nan)

lncRNA_protein['Source'] = 'LncBook'
lncRNA_protein.rename(columns={'RNA':':START_ID', 'RBP':':END_ID'},inplace=True)
lncRNA_protein.head(n=3)

* [tRNAdb](http://trna.bioinf.uni-leipzig.de/DataOutput/) <br /> tRNAdb contains more than 12 000 tRNA genes, classified into families according to amino acid specificity. The database provides various services including graphical representations of tRNA secondary structures, a customizable output of aligned or un-aligned sequences with a variety of individual and combinable search criteria, as well as the construction of consensus sequences for any selected set of tRNAs.

In [None]:
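# Manual step: open http://trna.bioinf.uni-leipzig.de/DataOutput/Result and save the page as 'tRNAdb - Transfer RNA database.html' in the unprocessed-data folder (the next cell reads it from there).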

In [None]:
# Parse the saved tRNAdb result page and take its fourth HTML table (index 3).
# NOTE(review): the following cell reassigns `tRNA_aa` from tRNA_MINTbase_GtRNAdb_map, so the
# frame built here is only displayed and does not feed the edge list -- confirm this is intended.
tRNA_aa = pd.read_html(unprocessed_data_location+'tRNAdb - Transfer RNA database.html')[3] # ChEBI - Amino acid
# Drop unused columns (by position label), promote the first row to column names and
# skip the first two rows.
tRNA_aa.drop(columns=[0,1,2,4,19,20],inplace=True)
tRNA_aa.rename(columns=tRNA_aa.iloc[0], inplace=True)
tRNA_aa = tRNA_aa.iloc[2:]
tRNA_aa.head(n=3)

In [None]:
# Derive the amino acid of each tRNA from its GtRNAdb name.
# .copy(): new columns must be written to an independent frame, not to a slice of the map.
tRNA_aa = tRNA_MINTbase_GtRNAdb_map[['gtRNAdb name']].copy()

name_tokens = tRNA_aa['gtRNAdb name'].str.split("-")
second_token = name_tokens.str[1]
# The amino acid is the second '-'-separated token, unless that token is the literal 'tRNA',
# in which case the third token is used. If the third token is missing, the second token is
# kept as a fallback.
# This replaces a chained `fillna(..., inplace=True)` on a column selection, which is
# deprecated and has no effect when pandas Copy-on-Write is enabled.
tRNA_aa['Amino Acid'] = name_tokens.str[2].where(second_token == 'tRNA').fillna(second_token)
tRNA_aa.head(n=3)

In [None]:
# tRNA -> amino acid edges: map GtRNAdb names to RNAcentral IDs and amino-acid labels
# through aa_chebi_map (both inner joins).
gtrnadb_to_rnacentral = (
    rnacentral_map_human_gtrnadb[['RNAcentral ID','GtRNAdb Gene ID']]
    .drop_duplicates()
    .rename(columns={'GtRNAdb Gene ID':'gtRNAdb name'})
)
amino_acid_to_chebi = aa_chebi_map.rename(columns={0:'Amino Acid'})

tRNA_aa = (
    tRNA_aa
    .merge(gtrnadb_to_rnacentral, on='gtRNAdb name')
    .drop(columns=['gtRNAdb name'])
    .rename(columns={'RNAcentral ID':'RNA'})
    .merge(amino_acid_to_chebi, on='Amino Acid')
    .drop(columns=['Amino Acid'])
    .rename(columns={1:'Amino acid'})
)

# Every edge is attributed to both sources: one row per (edge, source), de-duplicated.
tRNA_aa['Source'] = 'GtRNAdb, tRNAdb'
tRNA_aa['Source'] = tRNA_aa['Source'].str.split(', ')
tRNA_aa = (
    tRNA_aa
    .explode('Source')
    .drop_duplicates()
    .rename(columns={'RNA':':START_ID', 'Amino acid':':END_ID'})
)
tRNA_aa.head(n=3)

In [None]:
# Stack the four RNA -> OBO edge tables and collapse duplicate pairs into sets of
# locations, binding positions and sources.
rna_obo_edge_tables = [lncRNA_protein, tRNA_aa, aptamer_chemical, aptamer_protein]
rna_obo_aggregations = {'Location':set,'Binding_pos':set,'Source':set}
RNA_molecularly_interacts_with_OBO = (
    pd.concat(rna_obo_edge_tables)
    .groupby([':START_ID',':END_ID'])
    .agg(rna_obo_aggregations)
    .reset_index()
)
RNA_molecularly_interacts_with_OBO[':TYPE'] = 'molecularly_interacts_with'
RNA_molecularly_interacts_with_OBO.to_pickle(unprocessed_edge_data_location+'RNA_molecularly_interacts_with_OBO.pkl')

# Inverse direction: same edges with start and end nodes swapped.
swap_endpoints = {':START_ID':':END_ID',':END_ID':':START_ID'}
OBO_molecularly_interacts_with_RNA = RNA_molecularly_interacts_with_OBO.rename(columns=swap_endpoints)
OBO_molecularly_interacts_with_RNA.to_pickle(unprocessed_edge_data_location+'OBO_molecularly_interacts_with_RNA.pkl')
OBO_molecularly_interacts_with_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with) - Gene

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter 'Download_data_RD' archive into the unprocessed-data folder.
!wget http://www.rnainter.org/raidMedia/download/Download_data_RD.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RD.tar.gz

In [None]:
# RNA-gene interactions from the RNAInter 'Download_data_RD' archive.
# NOTE(review): the archive is read directly and its first column arrives under the name
# 'Download_data_RD.txt', which is renamed to 'RNAInterID' -- presumably an artefact of
# reading the tar container; confirm.
RNA_gene = pd.read_csv(unprocessed_data_location+'Download_data_RD.tar.gz',
                                            sep='\t').rename(columns={'Download_data_RD.txt':'RNAInterID'})

# We select only strong evidence interactions for hsa
# (na=False: rows with a missing species are excluded instead of breaking the mask).
RNA_gene = RNA_gene[(RNA_gene['score'] >= 0.2886) & (RNA_gene['Species1'].str.contains('apiens', na=False)) &
                    (RNA_gene['Species2'].str.contains('apiens', na=False))]

print(set(RNA_gene.Category2)) # Genes are all in the second column
print(set(RNA_gene.Category1))
# Keep rows whose second interactor has an NCBI identifier. The former
# `RNA_gene['Raw_ID2'].str != 'nan'` compared the .str accessor object itself (always True),
# so missing values were never filtered; na=False excludes them explicitly.
# .copy(): the assignments below must write to an independent frame, not to a slice.
RNA_gene = RNA_gene[RNA_gene['Raw_ID2'].str.startswith('NCBI', na=False)].copy()
RNA_gene['Raw_ID1'] = RNA_gene['Raw_ID1'].str.replace("NCBI:", '')
RNA_gene['Raw_ID2'] = RNA_gene['Raw_ID2'].str.replace("NCBI:", '')

# One row per ';'-separated identifier on either side.
RNA_gene['Raw_ID1'] = RNA_gene['Raw_ID1'].str.split(';')
RNA_gene = RNA_gene.explode('Raw_ID1')
RNA_gene['Raw_ID2'] = RNA_gene['Raw_ID2'].str.split(';')
RNA_gene = RNA_gene.explode('Raw_ID2')

RNA_gene.head(n=3)

In [None]:
# Map the first interactor of mRNA rows through ensembl_entrezTranscript_map (column 0 -> column 1).
is_mrna = RNA_gene['Category1'] == 'mRNA'
mrna = RNA_gene[is_mrna]
# Raw_ID1 values are strings, so the join key of the map is cast to str as well.
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
# Inner join on the original ID, then keep only column 1 of the map as the new Raw_ID1.
mrna = pd.merge(mrna, ensembl_entrezTranscript_map_mrna, left_on=['Raw_ID1'],
                right_on=[0]).drop(columns=['Raw_ID1',0,2,3,4,5]).rename(columns={1:'Raw_ID1'})
# Replace the mRNA rows by their mapped versions (mRNA rows without a mapping are dropped).
# Rows are selected with a boolean mask and the result is re-indexed: the merged frame gets a
# fresh 0..n-1 index that would otherwise collide with the original row labels and make any
# later label-based drop remove unrelated rows.
RNA_gene = pd.concat([mrna, RNA_gene[~is_mrna]], ignore_index=True)

mrna.head(n=3)

In [None]:
# Map the first interactor of non-mRNA rows to RNAcentral IDs via HGNC symbol and RNA category.
is_mrna = RNA_gene['Category1'] == 'mRNA'
ncrna = RNA_gene[~is_mrna]
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]
ncrna = pd.merge(ncrna, rnacentral_map_human_hgnc_type, left_on=['Interactor1.Symbol','Category1'],
                right_on=["HGNC symbol",'RNA category']).drop(columns=["Interactor1.Symbol",'Raw_ID1','RNA category'])
ncrna = ncrna.rename(columns={"RNAcentral ID":'Raw_ID1'})
# Replace the non-mRNA rows by their mapped versions (rows without a mapping are dropped).
# The rows to keep are selected with a boolean mask: dropping by index label, as before,
# also removed mRNA rows whose labels happened to coincide with labels of non-mRNA rows
# (the frame can contain repeated labels after earlier concat/explode steps).
RNA_gene = pd.concat([ncrna, RNA_gene[is_mrna]], ignore_index=True)

ncrna.head(n=3)

In [None]:
# Keep rows whose RNA identifier has one of the accepted prefixes and whose gene identifier
# starts with a digit. na=False: rows without an identifier are excluded explicitly.
RNA_gene = RNA_gene[(RNA_gene['Raw_ID1'].str.startswith("URS", na=False)) | (RNA_gene['Raw_ID1'].str.startswith("ENST", na=False))
                    | (RNA_gene['Raw_ID1'].str.startswith("trfdb?", na=False)) | (RNA_gene['Raw_ID1'].str.startswith("hsa_circ_", na=False))]
RNA_gene = RNA_gene[RNA_gene['Raw_ID2'].str[0].str.isdigit()].copy()

# Join the three evidence columns with '//' and strip the placeholders left by missing values.
# Literal replacement (regex=False) replaces the former "nan\/\/" patterns, whose "\/" is an
# invalid escape sequence in a normal string literal.
RNA_gene['Method'] = RNA_gene['strong'].astype(str) + "//" + RNA_gene['weak'].astype(str) + "//" + RNA_gene['predict'].astype(str)
RNA_gene['Method'] = RNA_gene['Method'].str.replace("nan//", '', regex=False)
RNA_gene['Method'] = RNA_gene['Method'].str.replace("//nan", '', regex=False)
RNA_gene['Method'] = RNA_gene['Method'].replace("nan", np.nan)
# One row per method, lower-cased.
RNA_gene['Method'] = RNA_gene['Method'].str.lower().str.split("//")
RNA_gene = RNA_gene.explode('Method')
# Normalise method names (left join on column '0_y'); values without a match keep their text.
# NOTE(review): methods are looked up in `location_map`, whereas the LncRNAWiki cell uses
# `method_map` for its experimental methods -- confirm which map is intended here.
RNA_gene = pd.merge(RNA_gene, location_map, right_on='0_y', left_on='Method', how='left')
RNA_gene['0_x'] = RNA_gene['0_x'].fillna(RNA_gene['Method'])
RNA_gene = RNA_gene.drop(columns=['0_y', 'Method'])
RNA_gene = RNA_gene.rename(columns={'0_x':'Method'})

RNA_gene['Source'] = 'RNAInter'
# 'RNAsister_score' is the column name the aggregation cell below expects.
RNA_gene = RNA_gene.rename(columns={'Raw_ID1':':START_ID', 'Raw_ID2':':END_ID', 'score':'RNAsister_score'})
RNA_gene.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# lncRNA -> gene edges from the LncRNAWiki export, restricted to targets typed as 'PCG'.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv')
lncRNA_gene = LncRNAWiki[LncRNAWiki['target_type'].notna()]
lncRNA_gene = lncRNA_gene[lncRNA_gene.target_type.str.contains('PCG')]
lncRNA_gene = lncRNA_gene.drop(columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','clinical_detail','biological_process','pathway',
                                        'modification_detail','target_interaction', 'conservation_species','target_effect','epigenetic_modification',
                                        'description','conservation','target_type','biological_context','regulator_effect','expression_detail',
                                        'regulator_interaction', 'genome_variation', 'variation_detail', 'molecular_function',
                                        'expression','regulator_type','functional_mechanism'])

# One row per ','-separated transcript ID; rows without a transcript are dropped.
lncRNA_gene['transcript_id'] = lncRNA_gene['transcript_id'].str.split(',')
lncRNA_gene = lncRNA_gene.explode('transcript_id')
lncRNA_gene = lncRNA_gene[lncRNA_gene['transcript_id'].notna()]

# Replace LncBook transcript IDs by RNAcentral IDs (inner join).
lncRNA_gene = pd.merge(lncRNA_gene, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})

# Replace the target by column 1 of symbol_entrez_map (inner join).
lncRNA_gene = pd.merge(lncRNA_gene,symbol_entrez_map.rename(columns={0:'target'}), on='target').drop(columns=['target']).rename(columns={1:'Gene'})
# Missing values become the string 'nan' so the .str operations below work on every column.
# NOTE(review): after this fill the two notna() filters cannot remove anything, and the
# string 'nan' is carried into the Drug/Regulator/Method/Location values -- confirm intent.
lncRNA_gene = lncRNA_gene.fillna('nan')
lncRNA_gene = lncRNA_gene[lncRNA_gene['RNA'].notna()]
lncRNA_gene = lncRNA_gene[lncRNA_gene['Gene'].notna()]

# Normalise PubMed IDs to digit-only strings; anything non-numeric becomes NaN.
# The previous str.replace(".0", "") is a regular expression on pandas < 2.0 (any character
# followed by '0'), which strips digits from the IDs.
pmid_numeric = pd.to_numeric(lncRNA_gene['pmid'], errors='coerce')
lncRNA_gene['pmid'] = pmid_numeric.map(lambda pmid: str(int(pmid)) if pd.notna(pmid) else np.nan)

# One row per ';'-separated drug and regulator, lower-cased.
lncRNA_gene['drug'] = lncRNA_gene['drug'].str.lower().str.split(";")
lncRNA_gene = lncRNA_gene.explode('drug')

lncRNA_gene['regulator'] = lncRNA_gene['regulator'].str.lower().str.split(";")
lncRNA_gene = lncRNA_gene.explode('regulator')

# Normalise experimental methods through method_map (left join on '0_y'); values without a
# match keep their text.
lncRNA_gene['experimental_method'] = lncRNA_gene['experimental_method'].str.lower().str.split(";")
lncRNA_gene = lncRNA_gene.explode('experimental_method')
lncRNA_gene = pd.merge(lncRNA_gene, method_map, right_on='0_y', left_on='experimental_method', how='left')
lncRNA_gene['0_x'] = lncRNA_gene['0_x'].fillna(lncRNA_gene['experimental_method'])
lncRNA_gene = lncRNA_gene.drop(columns=['0_y', 'experimental_method'])
# ('target' was dropped above, so its entry in this mapping has no effect.)
lncRNA_gene = lncRNA_gene.rename(columns={'0_x':'Method','pmid':'PubMedID', 'drug':'Drug', 'regulator':'Regulator', 'target':'Interactor'})

# Normalise tissues / cell lines through location_map, same pattern as above.
lncRNA_gene['tissue/cell line'] = lncRNA_gene['tissue/cell line'].str.lower().str.split(";")
lncRNA_gene = lncRNA_gene.explode('tissue/cell line')
lncRNA_gene = pd.merge(lncRNA_gene, location_map, right_on='0_y', left_on='tissue/cell line', how='left')
lncRNA_gene['0_x'] = lncRNA_gene['0_x'].fillna(lncRNA_gene['tissue/cell line'])
lncRNA_gene = lncRNA_gene.drop(columns=['0_y', 'tissue/cell line'])
lncRNA_gene = lncRNA_gene.rename(columns={'0_x':'Location'})

# Normalise context details through disease_map, same pattern; stored as 'Location2'.
lncRNA_gene['context_detail'] = lncRNA_gene['context_detail'].str.lower().str.split(";")
lncRNA_gene = lncRNA_gene.explode('context_detail')
lncRNA_gene = pd.merge(lncRNA_gene, disease_map, right_on='0_y', left_on='context_detail', how='left')
lncRNA_gene['0_x'] = lncRNA_gene['0_x'].fillna(lncRNA_gene['context_detail'])
lncRNA_gene = lncRNA_gene.drop(columns=['0_y', 'context_detail'])
lncRNA_gene = lncRNA_gene.rename(columns={'0_x':'Location2'})

# Stack both annotations into a single 'Location' column: every row appears twice, once with
# its tissue/cell line and once with its context detail.
lncRNA_gene = pd.concat([lncRNA_gene.drop(columns=['Location2']), lncRNA_gene.drop(columns=['Location']).rename(columns={'Location2':'Location'})])

lncRNA_gene['Source'] = 'LncRNAWiki'
lncRNA_gene = lncRNA_gene.rename(columns={'RNA':':START_ID', 'Gene':':END_ID'})
lncRNA_gene.head(n=3)

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# Load the snoDB export and keep the RNAcentral ID, host gene, target columns and expression flag.
snodb_column_names = ['rna_central_id','host_gene_id','rrna_targets','snrna_targets','lncrna_targets',
                      'protein_coding_targets','snorna_targets','mirna_targets','trna_targets',
                      'ncrna_targets','pseudogene_targets','other_targets','is_expressed']
snoDB = pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")[snodb_column_names]
snoDB = snoDB[snoDB['rna_central_id'].notna()].rename(columns={'rna_central_id':'RNA'})

# Every column becomes a list of its ';'-separated string values (missing values turn into
# ['nan']), then the columns are exploded one after the other.
snoDB = snoDB.assign(**{column: snoDB[column].astype(str).str.split(';') for column in snoDB.columns})
for column in snoDB.columns:
    snoDB = snoDB.explode(column)
snoDB['Source'] = 'snoDB'

# snoRNA -> protein-coding gene edges: drop the 'nan' placeholders, keep the part of the
# target before the first '.', and replace it by column 1 of symbol_entrez_map (inner join
# on the shared column).
has_pcg_target = snoDB['protein_coding_targets'] != 'nan'
snoRNA_pcg = snoDB.loc[has_pcg_target, ['RNA', 'protein_coding_targets', 'Source']]
snoRNA_pcg = snoRNA_pcg.assign(
    protein_coding_targets=snoRNA_pcg['protein_coding_targets'].str.split('.').str[0])

pcg_target_to_entrez = symbol_entrez_map.rename(columns={0:'protein_coding_targets'})
snoRNA_pcg = (
    pd.merge(snoRNA_pcg, pcg_target_to_entrez)
    .drop(columns=['protein_coding_targets'])
    .rename(columns={1:'Gene'})
)

snoRNA_pcg.head(n=3)

In [None]:
# snoRNA -> pseudogene edges, built like the protein-coding ones: drop the 'nan' placeholders,
# keep the part of the target before the first '.', and replace it by column 1 of
# symbol_entrez_map (inner join on the shared column).
has_pseudogene_target = snoDB['pseudogene_targets'] != 'nan'
snoRNA_pseudogene = snoDB.loc[has_pseudogene_target, ['RNA', 'pseudogene_targets', 'Source']]
snoRNA_pseudogene = snoRNA_pseudogene.assign(
    pseudogene_targets=snoRNA_pseudogene['pseudogene_targets'].str.split('.').str[0])

pseudogene_target_to_entrez = symbol_entrez_map.rename(columns={0:'pseudogene_targets'})
snoRNA_pseudogene = (
    pd.merge(snoRNA_pseudogene, pseudogene_target_to_entrez)
    .drop(columns=['pseudogene_targets'])
    .rename(columns={1:'Gene'})
)

snoRNA_pseudogene.head(n=3) # Empty

In [None]:
# Stack the RNAInter, LncRNAWiki and snoDB edge tables and collapse duplicate pairs:
# mean score, sets for every other attribute.
snoRNA_pcg_edges = snoRNA_pcg.rename(columns={'RNA':':START_ID','Gene':':END_ID'})
rna_gene_aggregations = {'RNAsister_score':np.mean,'Location':set,'Method':set,'Source':set,
                         'Regulator':set,'Drug':set,'PubMedID':set}
RNA_interacts_with_Gene = (
    pd.concat([RNA_gene, lncRNA_gene, snoRNA_pcg_edges])
    .groupby([':START_ID',':END_ID'])
    .agg(rna_gene_aggregations)
    .reset_index()
)
RNA_interacts_with_Gene[':TYPE'] = 'interacts_with'
RNA_interacts_with_Gene.to_pickle(unprocessed_edge_data_location+'RNA_interacts_with_gene.pkl')

# Inverse direction: same edges with start and end nodes swapped.
swap_endpoints = {':START_ID':':END_ID',':END_ID':':START_ID'}
Gene_interacts_with_RNA = RNA_interacts_with_Gene.rename(columns=swap_endpoints)
Gene_interacts_with_RNA.to_pickle(unprocessed_edge_data_location+'gene_interacts_with_RNA.pkl')
Gene_interacts_with_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002326 (contributes to) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations) <br/> The Gene Ontology (GO) knowledgebase is the world’s largest source of information on the functions of genes. This knowledge is both human-readable and machine-readable, and is a foundation for computational analysis of large-scale molecular biology and genetics experiments in biomedical research. 

In [None]:
# Download the GO annotation file for human RNAs (gzipped GAF) into the unprocessed-data folder.
!wget http://current.geneontology.org/annotations/goa_human_rna.gaf.gz -O ../resources/processed_data/unprocessed_data/goa_human_rna.gaf.gz

In [None]:
import gzip

# Column names applied to the 17 tab-separated fields of the annotation file.
gaf_columns = [
    'DB', 'DB Object ID', 'DB Object Symbol', 'Qualifier', 'GO ID', 'DB:Reference (|DB:Reference)', 'Evidence Code',
    'With (or) From', 'Aspect', 'DB Object Name', 'DB Object Synonym (|Synonym)', 'DB Object Type', 'Taxon(|taxon)',
    'Date', 'Assigned By', 'Annotation Extension', 'Gene Product Form ID',
]

# Read the gzipped file; everything after a '!' is treated as a comment by the parser.
gaf_path = unprocessed_data_location+'goa_human_rna.gaf.gz'
with gzip.open(gaf_path) as gaf_handle: # GO
    go_annotations = pd.read_csv(gaf_handle, comment='!', delimiter='\t', names=gaf_columns)

# Keep human (taxon 9606) annotations only.
is_human = go_annotations['Taxon(|taxon)'] == 'taxon:9606'
go_annotations = go_annotations[is_human]
go_annotations.head(n=2)

In [None]:
# Translate GO qualifiers into short relation codes (numeric part of the RO
# identifier, e.g. 'enables' -> '2327' for RO_0002327). Qualifiers that are
# not listed are left unchanged.
qualifier_codes = {
    'enables': '2327', # RO_0002327
    'involved_in': '2331',
    'located_in': '1025',
    'part_of': 'BFO50',
    'acts_upstream_of': '2263',
    'acts_upstream_of_or_within': '2264',
    'is_active_in': '2432',
    'acts_upstream_of_or_within_negative_effect': '4033',
    'acts_upstream_of_negative_effect': '4035',
    'colocalizes_with': '2325',
    'contributes_to': '2326',
    'NOT|involved_in': '2331?NOT',
    'NOT|located_in': '1025?NOT',
}

print(go_annotations['Qualifier'].unique())
# Whole-cell replacement: only exact matches of a key are rewritten.
go_annotations['Qualifier'] = go_annotations['Qualifier'].replace(qualifier_codes)
print(go_annotations['Qualifier'].unique())

In [None]:
# Sanity check: report whether the symbol column duplicates the ID column, then discard it.
ids_match_symbols = bool((go_annotations['DB Object ID'] == go_annotations['DB Object Symbol']).all())
print("Are all 'DB Object ID' and 'DB Object Symbol' cells equal?", ids_match_symbols)
go_annotations = go_annotations.drop('DB Object Symbol', axis=1)

In [None]:
# Columns that are not carried over to the edge table.
unused_gaf_columns = ['Gene Product Form ID', 'DB Object Synonym (|Synonym)', 'DB', 'Date', 'Annotation Extension',
                      'With (or) From', 'DB Object Name',
                      'DB Object Type', 'Taxon(|taxon)', "Aspect"]

go_annotations = go_annotations.assign(**{
    # 'GO:0001234' -> 'GO_0001234'
    'GO ID': go_annotations['GO ID'].str.replace('GO:', 'GO_'),
    # strip everything from the first underscore onwards
    'DB Object ID': go_annotations['DB Object ID'].str.replace('_.*', '', regex=True),
}).drop(columns=unused_gaf_columns)

# Show rows left without an object identifier (if any).
missing_object_id = go_annotations['DB Object ID'].isna()
print(go_annotations[missing_object_id])
go_annotations.head(n=3)

In [None]:
# Prefix every curating group with 'GOC|', then fold selected groups into plain
# 'GOC' and normalise 'UniProt' to 'UniProtKB'. Unlisted values keep their
# 'GOC|<group>' form.
assigned_by_aliases = {
    "GOC|GOC": "GOC",
    "GOC|BHF-UCL": "GOC",
    "GOC|ARUK-UCL": "GOC",
    "GOC|DIBU": "GOC",
    "GOC|ParkinsonsUK-UCL": "GOC",
    "GOC|FlyBase": "GOC",
    "GOC|UniProt": "GOC|UniProtKB",
}
prefixed_assigner = "GOC|" + go_annotations['Assigned By'].astype(str)
go_annotations['Assigned By'] = prefixed_assigner.replace(assigned_by_aliases)
go_annotations['Assigned By'].unique()

In [None]:
# Extract the PubMed identifier from the reference column and store it as a
# clean digit string (or NaN when no usable PMID is present).
reference_col = 'DB:Reference (|DB:Reference)'

# Text following the first 'PMID:' marker; references without that marker give NaN.
# NOTE(review): cells holding several '|'-separated references are not purely
# numeric after this split and therefore end up as NaN below -- confirm intended.
pmid = go_annotations[reference_col].str.split('PMID:').str[1]
pmid = pd.to_numeric(pmid, errors='coerce').replace(0, np.nan)
# The float -> str round-trip yields e.g. '12345.0'. Strip ONLY the trailing
# '.0': the previous un-anchored ".0" pattern is interpreted as a regex on
# pandas < 2.0 and removed every "<any char>0" pair, corrupting identifiers.
pmid = pmid.astype(str).str.replace(r'\.0$', '', regex=True)
go_annotations[reference_col] = pmid.replace("nan", np.nan)

go_annotations = go_annotations.rename(columns={'DB Object ID':':START_ID','GO ID':':END_ID','Evidence Code':'GO_evidence',
                                                'Assigned By':'Source','DB:Reference (|DB:Reference)':'PubMedID'})
# One row per individual source ('GOC|X' -> rows 'GOC' and 'X').
go_annotations['Source'] = go_annotations['Source'].str.split('|')
go_annotations = go_annotations.explode('Source')
go_annotations.head(n=3)

In [None]:
# Select the annotations coded '2326' (contributes_to) and merge duplicates so
# each (RNA, GO term) pair is one edge with set-valued attributes.
is_contributes_to = go_annotations['Qualifier'] == '2326'
go_annotations2326 = (
    go_annotations[is_contributes_to]
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
)
go_annotations2326[':TYPE'] = 'contributes_to'
go_annotations2326.to_pickle(unprocessed_edge_data_location + 'RNA_contributes_to_OBO.pkl')
go_annotations2326.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0003304 (contributes to condition) - OBO

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/) <br /> LncRNADisease integrates comprehensive experimentally supported and predicted ncRNA-disease associations, manually curated from the literature and collected from other resources.

In [None]:
# Fetch the full LncRNADisease export into the unprocessed-data folder; the following
# cells read it back as 'website_alldata.tsv'.
# NOTE(review): data_downloader is not defined in this notebook section -- presumably one of the pkt_kg helpers; confirm.
data_downloader('http://www.rnanut.net/lncrnadisease/static/download/website_alldata.tsv', unprocessed_data_location)

In [None]:
# circRNA - disease associations WITHOUT reported causality ('Causality' == 'No').
lncRNADisease = pd.read_csv(unprocessed_data_location + 'website_alldata.tsv', sep="\t") # Mondo+HPO
# We keep only rows dealing with HS
lncRNA_disease_contributes = lncRNADisease[(lncRNADisease['Species'].str.contains("sapiens")) & (lncRNADisease['Causality'] == 'No')]
# Non-inplace drop: the frame above is a filtered selection of lncRNADisease.
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['Causality','Species','Description','Dysfunction Pattern',
                                                                      'Clinical Application','Causal Description'])
# .copy() so the column assignments below act on an independent frame.
circRNADisease = lncRNA_disease_contributes[lncRNA_disease_contributes['ncRNA Category'] == 'CircRNA'].copy()
# The lower-cased circRNA symbol is used directly as the start node identifier.
circRNADisease[':START_ID'] = circRNADisease['ncRNA Symbol'].str.strip().str.lower()
circRNADisease['Disease Name'] = circRNADisease['Disease Name'].str.strip().str.lower()

# Map disease names to ontology identifiers (inner join drops unmapped names).
circRNADisease = pd.merge(circRNADisease, desc_disPhe_map.rename(columns={0:'Disease Name'}),
                                        on='Disease Name').drop(columns=['Disease Name']).rename(columns={1:':END_ID'})

# PubMed IDs as clean digit strings. Only the trailing '.0' left by the float
# round-trip is removed; the former un-anchored ".0" pattern acts as a regex on
# pandas < 2.0 and deleted every "<any char>0" pair inside the identifier.
circRNADisease['PubMed ID'] = pd.to_numeric(circRNADisease['PubMed ID'], errors='coerce')
circRNADisease['PubMed ID'] = circRNADisease['PubMed ID'].astype(str)
circRNADisease['PubMed ID'] = circRNADisease['PubMed ID'].str.replace(r"\.0$", "", regex=True)
circRNADisease['PubMed ID'] = circRNADisease['PubMed ID'].replace("nan", np.nan)

# Methods: split on '//' and then ';', one row per method, mapped through
# method_map; unmapped methods keep their raw (lower-cased) label.
circRNADisease['Validated Method'] = circRNADisease['Validated Method'].str.strip().str.split("//")
circRNADisease = circRNADisease.explode('Validated Method')
circRNADisease['Validated Method'] = circRNADisease['Validated Method'].str.lower().str.split(";")
circRNADisease = circRNADisease.explode('Validated Method')
circRNADisease = pd.merge(circRNADisease, method_map, right_on='0_y', left_on='Validated Method', how='left')
circRNADisease['0_x'] = circRNADisease['0_x'].fillna(circRNADisease['Validated Method'])
circRNADisease = circRNADisease.drop(columns=['0_y', 'Validated Method'])
circRNADisease = circRNADisease.rename(columns={'0_x':'Method','PubMed ID':'PubMedID'})

# Samples: same treatment, mapped through location_map.
circRNADisease['Sample'] = circRNADisease['Sample'].str.strip().str.lower().str.split(";")
circRNADisease = circRNADisease.explode('Sample')
circRNADisease = pd.merge(circRNADisease, location_map, right_on='0_y', left_on='Sample', how='left')
circRNADisease['0_x'] = circRNADisease['0_x'].fillna(circRNADisease['Sample'])
circRNADisease = circRNADisease.drop(columns=['0_y', 'Sample'])
circRNADisease = circRNADisease.rename(columns={'0_x':'Location'})
circRNADisease['Source'] = 'LncRNADisease'
circRNADisease.head(n=3)

In [None]:
# lncRNA - disease associations WITHOUT reported causality ('Causality' == 'No').
lncRNADisease = pd.read_csv(unprocessed_data_location + 'website_alldata.tsv', sep="\t") # Mondo+HPO
# We keep only rows dealing with HS
lncRNA_disease_contributes = lncRNADisease[(lncRNADisease['Species'].str.contains("sapiens")) & (lncRNADisease['Causality'] == 'No')]
# Non-inplace drop: the frame above is a filtered selection of lncRNADisease.
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['Causality','Species','Description','Dysfunction Pattern',
                                                                      'Clinical Application','Causal Description'])
# .copy() so the column assignment below acts on an independent frame.
lncRNA_disease_contributes = lncRNA_disease_contributes[lncRNA_disease_contributes['ncRNA Category'] == 'LncRNA'].copy()
lncRNA_disease_contributes['ncRNA Symbol'] = lncRNA_disease_contributes['ncRNA Symbol'].str.strip().str.upper()

# Resolve upper-cased symbols to RNAcentral IDs through the HGNC mapping
# (lncRNA rows only); the inner join drops symbols without a mapping.
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, rnacentral_map_human_hgnc[
    rnacentral_map_human_hgnc['RNA category'] == 'lncRNA'][['RNAcentral ID', 'HGNC symbol']].drop_duplicates().rename(
        columns={'HGNC symbol':'ncRNA Symbol'}),on='ncRNA Symbol')

lncRNA_disease_contributes['Disease Name'] = lncRNA_disease_contributes['Disease Name'].str.strip().str.lower()

# Map disease names to ontology identifiers (inner join drops unmapped names).
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, desc_disPhe_map.rename(columns={0:'Disease Name'}),
                                        on='Disease Name').drop(columns=['Disease Name']).rename(columns={1:':END_ID'})

# PubMed IDs as clean digit strings. Only the trailing '.0' left by the float
# round-trip is removed; the former un-anchored ".0" pattern acts as a regex on
# pandas < 2.0 and deleted every "<any char>0" pair inside the identifier.
lncRNA_disease_contributes['PubMed ID'] = pd.to_numeric(lncRNA_disease_contributes['PubMed ID'], errors='coerce')
lncRNA_disease_contributes['PubMed ID'] = lncRNA_disease_contributes['PubMed ID'].astype(str)
lncRNA_disease_contributes['PubMed ID'] = lncRNA_disease_contributes['PubMed ID'].str.replace(r"\.0$", "", regex=True)
lncRNA_disease_contributes['PubMed ID'] = lncRNA_disease_contributes['PubMed ID'].replace("nan", np.nan)

# Methods: split on '//' and then ';', one row per method, mapped through
# method_map; unmapped methods keep their raw (lower-cased) label.
lncRNA_disease_contributes['Validated Method'] = lncRNA_disease_contributes['Validated Method'].str.strip().str.split("//")
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Validated Method')
lncRNA_disease_contributes['Validated Method'] = lncRNA_disease_contributes['Validated Method'].str.lower().str.split(";")
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Validated Method')
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, method_map, right_on='0_y', left_on='Validated Method', how='left')
lncRNA_disease_contributes['0_x'] = lncRNA_disease_contributes['0_x'].fillna(lncRNA_disease_contributes['Validated Method'])
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['0_y', 'Validated Method'])
lncRNA_disease_contributes = lncRNA_disease_contributes.rename(columns={'0_x':'Method','PubMed ID':'PubMedID'})

# Samples: same treatment, mapped through location_map.
lncRNA_disease_contributes['Sample'] = lncRNA_disease_contributes['Sample'].str.lower().str.strip().str.split(";")
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Sample')
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, location_map, right_on='0_y', left_on='Sample', how='left')
lncRNA_disease_contributes['0_x'] = lncRNA_disease_contributes['0_x'].fillna(lncRNA_disease_contributes['Sample'])
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['0_y', 'Sample'])
lncRNA_disease_contributes = lncRNA_disease_contributes.rename(columns={'0_x':'Location'})

# Two provenance labels per row -> one row per label.
lncRNA_disease_contributes['Source'] = 'NONCODE, LncRNADisease'
lncRNA_disease_contributes['Source'] = lncRNA_disease_contributes['Source'].str.split(', ')
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Source')
# NOTE(review): ':END_ID' was already created by the disease merge above; the
# 'Disease' entry only has an effect if such a column exists -- confirm.
lncRNA_disease_contributes = lncRNA_disease_contributes.rename(columns={'RNAcentral ID':':START_ID', 'Disease':':END_ID'})
lncRNA_disease_contributes.head(n=3)

In [None]:
# Combine lncRNA and circRNA associations and collapse duplicates: one edge per
# (RNA, condition) pair, with every attribute gathered into a set.
condition_edge_attributes = {'PubMedID': set, 'Method': set, 'Location': set, 'Source': set}
RNA_contributes_to_condition_OBO = (
    pd.concat([lncRNA_disease_contributes, circRNADisease])
    .groupby([':START_ID', ':END_ID'])
    .agg(condition_edge_attributes)
    .reset_index()
)
RNA_contributes_to_condition_OBO[":TYPE"] = "contributes_to_condition"
RNA_contributes_to_condition_OBO.to_pickle(unprocessed_edge_data_location + 'RNA_contributes_to_condition_OBO.pkl')
RNA_contributes_to_condition_OBO.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0003303 (causes condition) - OBO

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/) <br /> LncRNADisease integrates comprehensive experimentally supported and predicted ncRNA-disease associations, manually curated from the literature and collected from other resources.

In [None]:
# circRNA - disease associations WITH reported causality ('Causality' == 'Yes').
# The variable names are shared with the 'contributes to condition' section
# even though they hold causal associations here.
lncRNADisease = pd.read_csv(unprocessed_data_location + 'website_alldata.tsv', sep="\t") # Mondo+HPO
# We keep only rows dealing with HS
lncRNA_disease_contributes = lncRNADisease[(lncRNADisease['Species'].str.contains("sapiens")) & (lncRNADisease['Causality'] == 'Yes')]
# Non-inplace drop: the frame above is a filtered selection of lncRNADisease.
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['Causality','Species','Description','Dysfunction Pattern',
                                                                      'Clinical Application','Causal Description'])
# .copy() so the column assignments below act on an independent frame.
circRNADisease = lncRNA_disease_contributes[lncRNA_disease_contributes['ncRNA Category'] == 'CircRNA'].copy()
# The lower-cased circRNA symbol is used directly as the start node identifier.
circRNADisease[':START_ID'] = circRNADisease['ncRNA Symbol'].str.strip().str.lower()
circRNADisease['Disease Name'] = circRNADisease['Disease Name'].str.strip().str.lower()

# Map disease names to ontology identifiers (inner join drops unmapped names).
circRNADisease = pd.merge(circRNADisease, desc_disPhe_map.rename(columns={0:'Disease Name'}),
                                        on='Disease Name').drop(columns=['Disease Name']).rename(columns={1:':END_ID'})

# PubMed IDs as clean digit strings. Only the trailing '.0' left by the float
# round-trip is removed; the former un-anchored ".0" pattern acts as a regex on
# pandas < 2.0 and deleted every "<any char>0" pair inside the identifier.
circRNADisease['PubMed ID'] = pd.to_numeric(circRNADisease['PubMed ID'], errors='coerce')
circRNADisease['PubMed ID'] = circRNADisease['PubMed ID'].astype(str)
circRNADisease['PubMed ID'] = circRNADisease['PubMed ID'].str.replace(r"\.0$", "", regex=True)
circRNADisease['PubMed ID'] = circRNADisease['PubMed ID'].replace("nan", np.nan)

# Methods: split on '//' and then ';', one row per method, mapped through
# method_map; unmapped methods keep their raw (lower-cased) label.
circRNADisease['Validated Method'] = circRNADisease['Validated Method'].str.strip().str.split("//")
circRNADisease = circRNADisease.explode('Validated Method')
circRNADisease['Validated Method'] = circRNADisease['Validated Method'].str.lower().str.split(";")
circRNADisease = circRNADisease.explode('Validated Method')
circRNADisease = pd.merge(circRNADisease, method_map, right_on='0_y', left_on='Validated Method', how='left')
circRNADisease['0_x'] = circRNADisease['0_x'].fillna(circRNADisease['Validated Method'])
circRNADisease = circRNADisease.drop(columns=['0_y', 'Validated Method'])
circRNADisease = circRNADisease.rename(columns={'0_x':'Method','PubMed ID':'PubMedID'})

# Samples: same treatment, mapped through location_map.
circRNADisease['Sample'] = circRNADisease['Sample'].str.strip().str.lower().str.split(";")
circRNADisease = circRNADisease.explode('Sample')
circRNADisease = pd.merge(circRNADisease, location_map, right_on='0_y', left_on='Sample', how='left')
circRNADisease['0_x'] = circRNADisease['0_x'].fillna(circRNADisease['Sample'])
circRNADisease = circRNADisease.drop(columns=['0_y', 'Sample'])
circRNADisease = circRNADisease.rename(columns={'0_x':'Location'})
circRNADisease['Source'] = 'LncRNADisease'
circRNADisease.head(n=3)

In [None]:
# lncRNA - disease associations WITH reported causality ('Causality' == 'Yes').
# The variable name is shared with the 'contributes to condition' section even
# though it holds causal associations here.
lncRNADisease = pd.read_csv(unprocessed_data_location + 'website_alldata.tsv', sep="\t") # Mondo+HPO
# We keep only rows dealing with HS
lncRNA_disease_contributes = lncRNADisease[(lncRNADisease['Species'].str.contains("sapiens")) & (lncRNADisease['Causality'] == 'Yes')]
# Non-inplace drop: the frame above is a filtered selection of lncRNADisease.
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['Causality','Species','Description','Dysfunction Pattern',
                                                                      'Clinical Application','Causal Description'])
# .copy() so the column assignment below acts on an independent frame.
lncRNA_disease_contributes = lncRNA_disease_contributes[lncRNA_disease_contributes['ncRNA Category'] == 'LncRNA'].copy()
lncRNA_disease_contributes['ncRNA Symbol'] = lncRNA_disease_contributes['ncRNA Symbol'].str.strip().str.upper()

# Resolve upper-cased symbols to RNAcentral IDs through the HGNC mapping
# (lncRNA rows only); the inner join drops symbols without a mapping.
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, rnacentral_map_human_hgnc[
    rnacentral_map_human_hgnc['RNA category'] == 'lncRNA'][['RNAcentral ID', 'HGNC symbol']].drop_duplicates().rename(
        columns={'HGNC symbol':'ncRNA Symbol'}),on='ncRNA Symbol')

lncRNA_disease_contributes['Disease Name'] = lncRNA_disease_contributes['Disease Name'].str.strip().str.lower()

# Map disease names to ontology identifiers (inner join drops unmapped names).
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, desc_disPhe_map.rename(columns={0:'Disease Name'}),
                                        on='Disease Name').drop(columns=['Disease Name']).rename(columns={1:':END_ID'})

# PubMed IDs as clean digit strings. Only the trailing '.0' left by the float
# round-trip is removed; the former un-anchored ".0" pattern acts as a regex on
# pandas < 2.0 and deleted every "<any char>0" pair inside the identifier.
lncRNA_disease_contributes['PubMed ID'] = pd.to_numeric(lncRNA_disease_contributes['PubMed ID'], errors='coerce')
lncRNA_disease_contributes['PubMed ID'] = lncRNA_disease_contributes['PubMed ID'].astype(str)
lncRNA_disease_contributes['PubMed ID'] = lncRNA_disease_contributes['PubMed ID'].str.replace(r"\.0$", "", regex=True)
lncRNA_disease_contributes['PubMed ID'] = lncRNA_disease_contributes['PubMed ID'].replace("nan", np.nan)

# Methods: split on '//' and then ';', one row per method, mapped through
# method_map; unmapped methods keep their raw (lower-cased) label.
lncRNA_disease_contributes['Validated Method'] = lncRNA_disease_contributes['Validated Method'].str.strip().str.split("//")
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Validated Method')
lncRNA_disease_contributes['Validated Method'] = lncRNA_disease_contributes['Validated Method'].str.lower().str.split(";")
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Validated Method')
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, method_map, right_on='0_y', left_on='Validated Method', how='left')
lncRNA_disease_contributes['0_x'] = lncRNA_disease_contributes['0_x'].fillna(lncRNA_disease_contributes['Validated Method'])
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['0_y', 'Validated Method'])
lncRNA_disease_contributes = lncRNA_disease_contributes.rename(columns={'0_x':'Method','PubMed ID':'PubMedID'})

# Samples: same treatment, mapped through location_map.
lncRNA_disease_contributes['Sample'] = lncRNA_disease_contributes['Sample'].str.lower().str.strip().str.split(";")
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Sample')
lncRNA_disease_contributes = pd.merge(lncRNA_disease_contributes, location_map, right_on='0_y', left_on='Sample', how='left')
lncRNA_disease_contributes['0_x'] = lncRNA_disease_contributes['0_x'].fillna(lncRNA_disease_contributes['Sample'])
lncRNA_disease_contributes = lncRNA_disease_contributes.drop(columns=['0_y', 'Sample'])
lncRNA_disease_contributes = lncRNA_disease_contributes.rename(columns={'0_x':'Location'})

# Two provenance labels per row -> one row per label.
lncRNA_disease_contributes['Source'] = 'NONCODE, LncRNADisease'
lncRNA_disease_contributes['Source'] = lncRNA_disease_contributes['Source'].str.split(', ')
lncRNA_disease_contributes = lncRNA_disease_contributes.explode('Source')
# NOTE(review): ':END_ID' was already created by the disease merge above; the
# 'Disease' entry only has an effect if such a column exists -- confirm.
lncRNA_disease_contributes = lncRNA_disease_contributes.rename(columns={'RNAcentral ID':':START_ID', 'Disease':':END_ID'})
lncRNA_disease_contributes.head(n=3)

In [None]:
# Combine the causal lncRNA and circRNA associations and collapse duplicates:
# one edge per (RNA, condition) pair, with every attribute gathered into a set.
causal_edge_attributes = {'PubMedID': set, 'Method': set, 'Location': set, 'Source': set}
RNA_causes_condition_OBO = (
    pd.concat([lncRNA_disease_contributes, circRNADisease])
    .groupby([':START_ID', ':END_ID'])
    .agg(causal_edge_attributes)
    .reset_index()
)
RNA_causes_condition_OBO[":TYPE"] = "causes_condition"
RNA_causes_condition_OBO.to_pickle(unprocessed_edge_data_location + 'RNA_causes_condition_OBO.pkl')
RNA_causes_condition_OBO.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002434 (interacts with) - OBO

* ViRBase

In [None]:
ViRBase = pd.read_csv(unprocessed_data_location + 'all_ncRNA_associated_interactions.txt', sep='\t') # PRO
# Restrict to human hosts ...
ViRBase = ViRBase.loc[ViRBase['Host Species'].str.contains('apiens')]
# ... and to entries whose score is >= 0.7 (see http://www.rna-society.org/virbase/help.html Q8)
ViRBase = ViRBase.loc[ViRBase['Score'] >= 0.7]

# PMIDs become integer-like strings (missing values become the string '<NA>');
# descriptive columns that are not needed for edge building are discarded.
virbase_unused_columns = ['ViRBase ID', 'Virus Name', 'Virus Strain Name', 'Virus Family',
                          'Host Species', 'Interactor1 Symbol', 'Interactor2 Symbol']
ViRBase = ViRBase.assign(
    PMID=pd.to_numeric(ViRBase['PMID'], errors='coerce').astype('Int64').astype(str)
).drop(columns=virbase_unused_columns)
ViRBase.head(n=3)

In [None]:
# Orient every interaction as host (Interactor1) -> virus (Interactor2).
is_host_first = (ViRBase['Interactor1 Source'] == 'host') & (ViRBase['Interactor2 Source'] == 'virus')
is_virus_first = (ViRBase['Interactor1 Source'] == 'virus') & (ViRBase['Interactor2 Source'] == 'host')

# Column renaming that exchanges the two interactor sides.
swap_interactors = {}
for field in ('Source', 'Category', 'ID'):
    swap_interactors['Interactor1 ' + field] = 'Interactor2 ' + field
    swap_interactors['Interactor2 ' + field] = 'Interactor1 ' + field

host_virus = ViRBase[is_host_first]
virus_host = ViRBase[is_virus_first].rename(columns=swap_interactors)

# Rows matching neither orientation are not kept.
ViRBase = pd.concat([virus_host, host_virus])
for category_col in ('Interactor1 Category', 'Interactor2 Category'):
    ViRBase[category_col] = ViRBase[category_col].str.replace(' ', '_')
ViRBase = ViRBase.drop(columns=['Interactor2 Source', 'Interactor1 Source'])
ViRBase.head(n=3)

In [None]:
# Inspection only: list the distinct values of the 'Taxonomy ID' column.
print(ViRBase['Taxonomy ID'].unique())

In [None]:
# Inspection only: distinct Interactor1 categories; the cells below branch on these values.
set(ViRBase['Interactor1 Category'])

In [None]:
# Map Interactor1 protein identifiers through entrez_pro_map (column 0 -> column 1).
is_interactor1_protein = ViRBase['Interactor1 Category'] == 'protein'
entrez_pro_map[0] = entrez_pro_map[0].astype(str)
# Inner join: protein rows whose identifier is not in the map are dropped.
ViRBase_pro = pd.merge(ViRBase[is_interactor1_protein], entrez_pro_map.rename(columns={0:'Interactor1 ID'}),
                       on=['Interactor1 ID'])
ViRBase_pro = ViRBase_pro.drop(columns=['Interactor1 ID']).rename(columns={1:'Interactor1 ID'})
# Replace the protein rows with their mapped version. A boolean mask is used
# instead of drop(index=...), and ignore_index=True gives the result a fresh
# unique index: the merge output is numbered from 0 and would otherwise share
# labels with the remaining rows, which makes later label-based drops remove
# unrelated rows.
ViRBase = pd.concat([ViRBase_pro, ViRBase[~is_interactor1_protein]], ignore_index=True)
ViRBase_pro.head(n=3)

In [None]:
# Manual mapping of viral protein identifiers/symbols to PRO terms.
# A value may list several PRO terms separated by ', '.
viral_protein_to_pro = {'1489078':'PR_P03126',
                        '1489080':'PR_P03120',
                        # Only papilloma type 16 is considered in PRO
                        '3783750':'PR_P03230',
                        '3783774':'PR_P03211',
                        '944566':'PR_000008466',
                        '944568':'PR_P0C6K0',
                        'E':'PR_000036822',
                        'M1':'PR_000049763',
                        'NP':'PR_000049760',
                        'NS1':'PR_000036824',
                        'NS3':'PR_000036828',
                        'P40':'PR_000038390',
                        'PB1':'PR_000049745',
                        'Pol':'PR_000044455',
                        'env':'PR_000003225',
                        'gag':'PR_000048976',
                        'NS5A':'PR_000012225',
                        'pp65':'PR_000009732',
                        '3783751':'PR_P03230',
                        'E7':'PR_P03126',
                        '1489079':'PR_P03126',
                        '1489088':'PR_P03126',
                        '1489089':'PR_P03126',
                        '1489085':'PR_P03120',
                        'Bet':'PR_000004725',
                        '156110':'PR_000011120, PR_000011118, PR_000011119',
                        'N':'PR_000050262'}

is_interactor2_protein = ViRBase['Interactor2 Category'] == 'protein'
# .copy() + explicit re-assignment: an inplace replace on a column pulled out
# of a filtered frame is not guaranteed to write back (it is a no-op under
# pandas copy-on-write).
viralprotein_RNA = ViRBase[is_interactor2_protein].copy()
viralprotein_RNA['Interactor2 ID'] = viralprotein_RNA['Interactor2 ID'].replace(viral_protein_to_pro)
# One row per PRO term for multi-term mappings.
viralprotein_RNA['Interactor2 ID'] = viralprotein_RNA['Interactor2 ID'].str.split(', ')
viralprotein_RNA = viralprotein_RNA.explode('Interactor2 ID')
# Swap the protein rows for their mapped version using a boolean mask.
# drop(index=...) is avoided because ViRBase may carry duplicate index labels
# from earlier concatenations, in which case a label-based drop also removes
# unrelated rows sharing a label.
ViRBase = pd.concat([viralprotein_RNA, ViRBase[~is_interactor2_protein]], ignore_index=True)
viralprotein_RNA.head(n=3)

In [None]:
# Inspection only: distinct 3-character prefixes of the Interactor1 miRNA identifiers.
ViRBase[(ViRBase['Interactor1 Category']=='miRNA')]['Interactor1 ID'].str[:3].unique()

In [None]:
# Replace miRBase identifiers by RNAcentral IDs, first for Interactor1 and then
# for Interactor2. Rows are selected with a boolean mask rather than
# drop(index=...): ViRBase may carry duplicate index labels from earlier
# concatenations, and a label-based drop then also removes unrelated rows.
for side in ('Interactor1', 'Interactor2'):
    id_col = side + ' ID'
    is_mirna = ViRBase[side + ' Category'] == 'miRNA'
    # Inner join: miRNA rows without a miRBase -> RNAcentral mapping are dropped.
    ViRBase_mirna = pd.merge(ViRBase[is_mirna], rnacentral_map_mirbase.rename(columns={'miRBase ID': id_col}),
                             on=[id_col]).drop(columns=[id_col,'DB','Organism','miRNA category','Label']).rename(
                                 columns={'RNAcentral ID': id_col})
    ViRBase = pd.concat([ViRBase_mirna, ViRBase[~is_mirna]], ignore_index=True)
# After the loop ViRBase_mirna holds the Interactor2 miRNA rows.
ViRBase_mirna.head(n=3)

In [None]:
# Step 1 -- look both interactor IDs up in symbol_entrez_map (column 0 ->
# column 1); identifiers without a match are kept unchanged.
for id_col in ('Interactor1 ID', 'Interactor2 ID'):
    ViRBase = pd.merge(ViRBase, symbol_entrez_map, left_on=[id_col], right_on=0, how='left')
    ViRBase[1] = ViRBase[1].fillna(ViRBase[id_col])
    ViRBase = ViRBase.drop(columns=[0, id_col]).rename(columns={1: id_col})

# Step 2 -- for every category other than miRNA / protein / pseudo / mRNA, map
# the (identifier, category) pair through ensembl_entrezTranscript_map
# (columns 0 and 2) and keep column 1 as the new identifier.
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
categories_handled_elsewhere = ['miRNA', 'protein', 'pseudo', 'mRNA']
for side in ('Interactor1', 'Interactor2'):
    id_col = side + ' ID'
    category_col = side + ' Category'
    is_other_ncrna = ~ViRBase[category_col].isin(categories_handled_elsewhere)
    # Inner join: rows without a mapping are dropped.
    ViRBase_ncrna = pd.merge(ViRBase[is_other_ncrna], ensembl_entrezTranscript_map,
                             left_on=[id_col, category_col],
                             right_on=[0, 2]).drop(columns=[id_col, 0, 2, 3, 4, 5]).rename(columns={1: id_col})
    # Boolean mask + ignore_index instead of drop(index=...): concatenating the
    # re-numbered merge output with the remaining rows produces duplicate index
    # labels, and a label-based drop on the second pass would then also remove
    # unrelated rows.
    ViRBase = pd.concat([ViRBase_ncrna, ViRBase[~is_other_ncrna]], ignore_index=True)

# After the loop ViRBase_ncrna holds the Interactor2 rows.
ViRBase_ncrna.head(n=3)

In [None]:
# Translate Ensembl transcript IDs into RNAcentral IDs for both interactors;
# identifiers without a match are kept unchanged.
# Fix: the Interactor2 pass used to fill, drop and rename 'Interactor1 ID',
# which overwrote Interactor1 with Interactor2's RNAcentral ID and left
# Interactor2 unmapped. Each pass now updates the column it merged on.
for id_col in ('Interactor1 ID', 'Interactor2 ID'):
    ViRBase = pd.merge(ViRBase, rnacentral_map_ensembl, left_on=[id_col], right_on=['Ensembl transcript ID'],
             how='left').drop(columns=['DB','Ensembl transcript ID','Organism','RNA category','Ensembl Gene ID'])
    ViRBase['RNAcentral ID'] = ViRBase['RNAcentral ID'].fillna(ViRBase[id_col])
    ViRBase = ViRBase.drop(columns=[id_col]).rename(columns={'RNAcentral ID': id_col})
ViRBase.head(n=3)

In [None]:
# Add the mirrored copy of every interaction, then keep only the rows whose
# end node is a PRO term ('PR_' prefix). na=False treats missing IDs as
# non-matching; .copy() lets the assignments below act on an independent frame.
ViRBase = pd.concat([ViRBase, ViRBase.rename(columns={'Interactor1 ID':'Interactor2 ID','Interactor2 ID':'Interactor1 ID'})])
ViRBase = ViRBase[ViRBase['Interactor2 ID'].str.startswith("PR_", na=False)].copy()

# PMIDs as clean digit strings. Only the trailing '.0' left by the float
# round-trip is removed; the former un-anchored ".0" pattern acts as a regex on
# pandas < 2.0 and deleted every "<any char>0" pair inside the identifier.
ViRBase['PMID'] = pd.to_numeric(ViRBase['PMID'], errors='coerce')
ViRBase['PMID'] = ViRBase['PMID'].astype(str)
ViRBase['PMID'] = ViRBase['PMID'].str.replace(r"\.0$", "", regex=True)
ViRBase['PMID'] = ViRBase['PMID'].replace("nan", np.nan)

# The ViRBase score is stored under the 'RNAsister_score' column name.
ViRBase = ViRBase.rename(columns={'Interactor1 ID':':START_ID', 'Interactor2 ID':':END_ID', 'PMID':'PubMedID','Score':'RNAsister_score'})
ViRBase['Source'] = 'ViRBase'
ViRBase.head(n=3)

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter RNA-compound archive into the unprocessed-data folder.
# NOTE(review): the target path is hard-coded; the next cell reads it via `unprocessed_data_location` -- confirm both point to the same folder.
!wget http://www.rnainter.org/raidMedia/download/Download_data_RC.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RC.tar.gz

In [None]:
# Read the archive directly with read_csv; a first column that comes out named
# after the archive member ('Download_data_RC.txt') is renamed to 'RNAInterID'.
RNA_chemical = pd.read_csv(unprocessed_data_location+'Download_data_RC.tar.gz',sep='\t').rename(columns={'Download_data_RC.txt':'RNAInterID'})

# We select only strong evidence interactions for hsa
# (.copy() so the column assignments below act on an independent frame)
RNA_chemical = RNA_chemical[(RNA_chemical['score'] >= 0.2886) &
                  (RNA_chemical['Species1'].str.contains('apiens'))].copy()

print(set(RNA_chemical.Category2)) # Chemicals are all in the second column
print(set(RNA_chemical.Category1))

# One row per raw identifier on each side (';'-separated lists).
RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.split(';')
RNA_chemical = RNA_chemical.explode('Raw_ID1')
# Keep the part of the symbol before the first '.'; the result is a plain
# string, so the former explode on this column was a no-op and is omitted.
RNA_chemical['Interactor1.Symbol'] = RNA_chemical['Interactor1.Symbol'].str.split('.').str[0]
RNA_chemical['Raw_ID2'] = RNA_chemical['Raw_ID2'].str.split(';')
RNA_chemical = RNA_chemical.explode('Raw_ID2')
RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.strip()
RNA_chemical['Raw_ID2'] = RNA_chemical['Raw_ID2'].str.strip()

# Map the lower-cased chemical name through desc_chebi_map and then through
# desc_drugbank_map; names without a match are kept as they are.
RNA_chemical = RNA_chemical[(RNA_chemical['Interactor2.Symbol'].notna())].copy()
RNA_chemical['Interactor2.Symbol'] = RNA_chemical['Interactor2.Symbol'].str.lower()
RNA_chemical = pd.merge(RNA_chemical, desc_chebi_map.rename(columns={0: 'Interactor2.Symbol'}), on='Interactor2.Symbol',
                        how='left')
# Explicit re-assignment instead of fillna(inplace=True) on a selected column,
# which is not guaranteed to write back to the frame (no-op under copy-on-write).
RNA_chemical[1] = RNA_chemical[1].fillna(RNA_chemical['Interactor2.Symbol'])
RNA_chemical = RNA_chemical.drop(columns=['Interactor2.Symbol']).rename(columns={1:'Interactor2.Symbol'})
RNA_chemical = pd.merge(RNA_chemical, desc_drugbank_map.rename(columns={0: 'Interactor2.Symbol'}), left_on=['Interactor2.Symbol'],
                        right_on=['Interactor2.Symbol'], how='left')
RNA_chemical[1] = RNA_chemical[1].fillna(RNA_chemical['Interactor2.Symbol'])
RNA_chemical = RNA_chemical.drop(columns=['Interactor2.Symbol']).rename(columns={1:'Interactor2.Symbol'})

# Strip the database prefixes from the RNA identifiers (literal matches).
for db_prefix in ("NCBI:", "miRBase:", "Ensembl:", "circBase:"):
    RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.replace(db_prefix, '', regex=False)

RNA_chemical.head(n=3)

In [None]:
# Swap Raw_ID1 for the matching 'RNAcentral ID' wherever Raw_ID1 is a known 'miRBase ID';
# identifiers without a match are kept as they are.
mirbase_to_rnacentral = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
RNA_chemical = pd.merge(RNA_chemical, mirbase_to_rnacentral, how="left",
                        left_on=['Raw_ID1'], right_on=['miRBase ID'])
RNA_chemical = RNA_chemical.drop(columns=["miRBase ID"])

resolved_id = RNA_chemical['RNAcentral ID'].fillna(RNA_chemical['Raw_ID1'])
RNA_chemical['RNAcentral ID'] = resolved_id
RNA_chemical = RNA_chemical.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})

RNA_chemical.head(n=3)

In [None]:
# Re-key mRNA interactors: Raw_ID1 is matched against column 0 of ensembl_entrezTranscript_map
# (restricted to rows whose column 2 is 'protein-coding') and replaced by column 1.
# The merge is an inner join, so mRNA rows without a match are dropped.
is_mrna = RNA_chemical['Category1'] == 'mRNA'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)  # join key must be str like Raw_ID1
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
mrna = (
    pd.merge(RNA_chemical[is_mrna], ensembl_entrezTranscript_map_mrna, left_on=['Raw_ID1'], right_on=[0])
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# Separate the remaining rows with the boolean mask rather than drop(index=...): a label-based
# drop removes every row sharing a label, which is only safe while the index is unique.
RNA_chemical = pd.concat([mrna, RNA_chemical[~is_mrna]])

mrna.head(n=3)

In [None]:
# Re-key pseudogene interactors: Raw_ID1 is matched against column 0 of
# ensembl_entrezTranscript_map (rows whose column 2 is 'pseudogene') and replaced by column 1.
# Inner join: pseudogene rows without a match are dropped.
# NOTE(review): the RNA_RNA pseudo cell filters the same map on 'pseudo', not 'pseudogene' -
# confirm which label column 2 actually carries.
is_pseudo = RNA_chemical['Category1'] == 'pseudo'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_pseudo = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'pseudogene']
pseudo = (
    pd.merge(RNA_chemical[is_pseudo], ensembl_entrezTranscript_map_pseudo, left_on=['Raw_ID1'], right_on=[0])
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# RNA_chemical carries duplicate index labels here (it is a concat of two frames), so the
# former drop(index=i) also deleted unrelated rows that happened to share a label with a
# pseudogene row. The boolean mask removes exactly the pseudogene rows.
RNA_chemical = pd.concat([pseudo, RNA_chemical[~is_pseudo]])

pseudo.head(n=3)

In [None]:
# Re-key every interactor that is neither 'mRNA' nor 'pseudo': match (symbol, category) against
# the human (Organism 9606) rows of rnacentral_map_hgnc and take 'RNAcentral ID' as Raw_ID1.
# Inner join: rows without an HGNC match are dropped and any Raw_ID1 assigned earlier is discarded.
# NOTE(review): this selection also contains miRNA rows whose Raw_ID1 was already resolved through
# miRBase; the RNA_protein version of this step excludes miRNA/tRNA/rRNA - confirm this is intended.
is_ncrna = ~RNA_chemical['Category1'].isin(['mRNA', 'pseudo'])
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]
ncrna = pd.merge(RNA_chemical[is_ncrna], rnacentral_map_human_hgnc_type,
                 left_on=['Interactor1.Symbol','Category1'],
                 right_on=["HGNC symbol",'RNA category']).drop(columns=["Interactor1.Symbol",'Raw_ID1','RNA category'])
ncrna = ncrna.rename(columns={"RNAcentral ID":'Raw_ID1'})
# Boolean mask instead of drop(index=i): the index has duplicate labels after the earlier
# concats, and a label-based drop would also remove already re-keyed mRNA/pseudo rows.
RNA_chemical = pd.concat([ncrna, RNA_chemical[~is_ncrna]])

ncrna.head(n=3)

In [None]:
# Keep rows whose RNA identifier starts with URS / ENST / hsa_circ or with a digit,
# and whose chemical identifier starts with DB or CHEBI.
RNA_chemical = RNA_chemical[(RNA_chemical['Raw_ID1'].str.startswith("URS")) | (RNA_chemical['Raw_ID1'].str.startswith("ENST")) |
                            (RNA_chemical['Raw_ID1'].str.startswith("hsa_circ")) | (RNA_chemical['Raw_ID1'].str[0].str.isdigit())]
RNA_chemical = RNA_chemical[(RNA_chemical['Interactor2.Symbol'].str.startswith("DB")) |
                            (RNA_chemical['Interactor2.Symbol'].str.startswith("CHEBI"))].copy()

# Build one 'Method' value per row from the strong/weak/predict columns, joined by '//'.
RNA_chemical['Method'] = RNA_chemical['strong'].astype(str) + '//' + RNA_chemical['weak'].astype(str) + '//' + RNA_chemical['predict'].astype(str)
RNA_chemical['Method'] = RNA_chemical['Method'].str.lower()
# Remove the 'nan' placeholders left by missing columns. Literal matching replaces the former
# 'nan\/\/' patterns, whose '\/' is an invalid escape sequence in a normal string literal.
RNA_chemical['Method'] = RNA_chemical['Method'].str.replace('nan//', '', regex=False)
RNA_chemical['Method'] = RNA_chemical['Method'].str.replace('//nan', '', regex=False)
RNA_chemical['Method'] = RNA_chemical['Method'].replace('nan',np.nan)
# One row per individual method, then translate through method_map ('0_y' -> '0_x');
# methods missing from the map keep their original text.
RNA_chemical['Method'] = RNA_chemical['Method'].str.split('//')
RNA_chemical = RNA_chemical.explode('Method')
RNA_chemical = pd.merge(RNA_chemical, method_map, right_on='0_y', left_on='Method', how='left')
RNA_chemical['0_x'] = RNA_chemical['0_x'].fillna(RNA_chemical['Method'])
RNA_chemical = RNA_chemical.drop(columns=['0_y', 'Method'])
RNA_chemical = RNA_chemical.rename(columns={'0_x':'Method'})

RNA_chemical['Source'] = 'RNAInter'
RNA_chemical = RNA_chemical.rename(columns={'Raw_ID1':':START_ID', 'Interactor2.Symbol':':END_ID', 'score':'RNAsister_score'})
# Rows whose start identifier begins with a digit go to gene_chemical; the rest stay in RNA_chemical.
starts_with_digit = RNA_chemical[':START_ID'].str[0].str.isdigit()
gene_chemical = RNA_chemical[starts_with_digit]
RNA_chemical = RNA_chemical[~starts_with_digit]
RNA_chemical.head(n=3)

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter Download_data_RH archive into the unprocessed-data folder
# (-O writes to the given path, replacing any earlier copy).
!wget http://www.rnainter.org/raidMedia/download/Download_data_RH.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RH.tar.gz

In [None]:
RNA_hisMod = pd.read_csv(unprocessed_data_location+'Download_data_RH.tar.gz',sep='\t').rename(columns={'Download_data_RH.txt':'RNAInterID'})
# Histone modification (SO)

# We select only strong evidence interactions for hsa
RNA_hisMod = RNA_hisMod[(RNA_hisMod['score'] >= 0.2886) &
                  (RNA_hisMod['Species1'].str.contains('apiens'))].copy()

print(set(RNA_hisMod.Category2)) # Histone modifications are all in the second column
print(set(RNA_hisMod.Category1))

# Normalise the histone-modification labels so they can be looked up in desc_so_map.
histone_label = RNA_hisMod['Interactor2.Symbol'].str.lower()
print(histone_label.unique())
# Literal (regex=False) rewrites, applied in order. Being explicit matters for 'h3.3':
# as a regex the '.' would match any character (e.g. the 'h3k3' inside 'h3k36me3').
# NOTE(review): 'h3k27me1' -> 'h2k27me1' is kept from the original; confirm it is not a typo.
#RNA_hisMod['Interactor2.Symbol'] = RNA_hisMod['Interactor2.Symbol'].str.lower().str.replace('h2afz','h2azac')
for raw_label, mapped_label in [
    ('h3k9-14ac', 'h3k9ac, h3k14ac'),
    ('hist2h3c', 'histone acetylation site'),
    ('h3ace', 'h3ac'),
    ('h3k27me1', 'h2k27me1'),
    ('h3.3', 'histone modification'),
]:
    histone_label = histone_label.str.replace(raw_label, mapped_label, regex=False)
# Only a stand-alone 'h3k4' is expanded. A plain substring replace also rewrote labels such as
# 'h3k4me3' into 'h3k4 methylation siteme3', which can no longer match anything in desc_so_map.
histone_label = histone_label.str.replace(r'\bh3k4\b', 'h3k4 methylation site', regex=True)

# A label may now hold several comma-separated modifications: one row per modification.
RNA_hisMod['Interactor2.Symbol'] = histone_label.str.split(', ')
RNA_hisMod = RNA_hisMod.explode('Interactor2.Symbol')

# Inner join: labels absent from column 0 of desc_so_map are dropped; column 1 becomes the new symbol.
RNA_hisMod = pd.merge(RNA_hisMod, desc_so_map, left_on=['Interactor2.Symbol'], right_on=0)
RNA_hisMod = RNA_hisMod.drop(columns=['Interactor2.Symbol', 0])
RNA_hisMod = RNA_hisMod.rename(columns={1:'Interactor2.Symbol'})

print(set(RNA_hisMod.Category1))

# Strip database prefixes, then split multi-valued identifiers into one row each.
RNA_hisMod['Raw_ID1'] = RNA_hisMod['Raw_ID1'].str.replace("NCBI:", '', regex=False)
RNA_hisMod['Raw_ID1'] = RNA_hisMod['Raw_ID1'].str.replace("miRBase:", '', regex=False)

RNA_hisMod['Raw_ID1'] = RNA_hisMod['Raw_ID1'].str.split(';')
RNA_hisMod = RNA_hisMod.explode('Raw_ID1')
# Keep only the part of the symbol before the first '.'.
RNA_hisMod['Interactor1.Symbol'] = RNA_hisMod['Interactor1.Symbol'].str.split('.').str[0]

RNA_hisMod.head(n=3)

In [None]:
# Where Raw_ID1 is a known 'miRBase ID', use the corresponding 'RNAcentral ID' instead;
# all other identifiers pass through unchanged.
mirbase_to_rnacentral = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
RNA_hisMod = pd.merge(RNA_hisMod, mirbase_to_rnacentral, how="left",
                      left_on=['Raw_ID1'], right_on=['miRBase ID'])
RNA_hisMod = RNA_hisMod.drop(columns=["miRBase ID"])

resolved_id = RNA_hisMod['RNAcentral ID'].fillna(RNA_hisMod['Raw_ID1'])
RNA_hisMod['RNAcentral ID'] = resolved_id
RNA_hisMod = RNA_hisMod.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})

RNA_hisMod.head(n=3)

In [None]:
# Re-key mRNA interactors: Raw_ID1 is matched against column 0 of ensembl_entrezTranscript_map
# (rows whose column 2 is 'protein-coding') and replaced by column 1.
# Inner join: mRNA rows without a match are dropped.
is_mrna = RNA_hisMod['Category1'] == 'mRNA'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)  # join key must be str like Raw_ID1
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
mrna = (
    pd.merge(RNA_hisMod[is_mrna], ensembl_entrezTranscript_map_mrna, left_on=['Raw_ID1'], right_on=[0])
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# Boolean mask instead of drop(index=...): a label-based drop is only correct while the
# index is unique, which this cell cannot guarantee on its own.
RNA_hisMod = pd.concat([mrna, RNA_hisMod[~is_mrna]])

mrna.head(n=3)

In [None]:
# Re-key every interactor that is neither 'mRNA' nor 'pseudo': match (symbol, category) against
# the human (Organism 9606) rows of rnacentral_map_hgnc and take 'RNAcentral ID' as Raw_ID1.
# Inner join: rows without an HGNC match are dropped and any Raw_ID1 assigned earlier is discarded.
# NOTE(review): miRNA rows already resolved through miRBase are part of this selection as well;
# the RNA_protein version of this step excludes miRNA/tRNA/rRNA - confirm this is intended.
is_ncrna = ~RNA_hisMod['Category1'].isin(['mRNA', 'pseudo'])
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]
ncrna = pd.merge(RNA_hisMod[is_ncrna], rnacentral_map_human_hgnc_type,
                 left_on=['Interactor1.Symbol','Category1'],
                 right_on=["HGNC symbol",'RNA category']).drop(columns=["Interactor1.Symbol",'Raw_ID1','RNA category'])
ncrna = ncrna.rename(columns={"RNAcentral ID":'Raw_ID1'})
# Boolean mask instead of drop(index=i): after the mRNA concat the index holds duplicate labels,
# so a label-based drop would also delete re-keyed mRNA rows sharing a label with an ncRNA row.
RNA_hisMod = pd.concat([ncrna, RNA_hisMod[~is_ncrna]])

ncrna.head(n=3)

In [None]:
# Keep rows whose RNA identifier starts with URS / ENST / hsa_circ or with a digit.
RNA_hisMod = RNA_hisMod[(RNA_hisMod['Raw_ID1'].str.startswith("URS")) | (RNA_hisMod['Raw_ID1'].str.startswith("ENST")) |
                            (RNA_hisMod['Raw_ID1'].str.startswith("hsa_circ")) | (RNA_hisMod['Raw_ID1'].str[0].str.isdigit())].copy()

# Build one 'Method' value per row from the strong/weak/predict columns, joined by '//'.
RNA_hisMod['Method'] = RNA_hisMod['strong'].astype(str) + '//' + RNA_hisMod['weak'].astype(str) + '//' + RNA_hisMod['predict'].astype(str)
RNA_hisMod['Method'] = RNA_hisMod['Method'].str.lower()
# Remove the 'nan' placeholders left by missing columns. Literal matching replaces the former
# 'nan\/\/' patterns, whose '\/' is an invalid escape sequence in a normal string literal.
RNA_hisMod['Method'] = RNA_hisMod['Method'].str.replace('nan//', '', regex=False)
RNA_hisMod['Method'] = RNA_hisMod['Method'].str.replace('//nan', '', regex=False)
RNA_hisMod['Method'] = RNA_hisMod['Method'].replace('nan',np.nan)
# One row per individual method, then translate through method_map ('0_y' -> '0_x');
# methods missing from the map keep their original text.
RNA_hisMod['Method'] = RNA_hisMod['Method'].str.split('//')
RNA_hisMod = RNA_hisMod.explode('Method')
RNA_hisMod = pd.merge(RNA_hisMod, method_map, right_on='0_y', left_on='Method', how='left')
RNA_hisMod['0_x'] = RNA_hisMod['0_x'].fillna(RNA_hisMod['Method'])
RNA_hisMod = RNA_hisMod.drop(columns=['0_y', 'Method'])
RNA_hisMod = RNA_hisMod.rename(columns={'0_x':'Method'})

RNA_hisMod['Source'] = 'RNAInter'
# Discard rows whose identifier begins with a digit (they passed the filter above).
RNA_hisMod = RNA_hisMod[~RNA_hisMod['Raw_ID1'].str[0].str.isdigit()]
RNA_hisMod = RNA_hisMod.rename(columns={'Raw_ID1':':START_ID', 'Interactor2.Symbol':':END_ID', 'score':'RNAsister_score'})
RNA_hisMod.head(n=3)

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter Download_data_RP archive into the unprocessed-data folder
# (-O writes to the given path, replacing any earlier copy).
!wget http://www.rnainter.org/raidMedia/download/Download_data_RP.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RP.tar.gz

In [None]:
RNA_protein = pd.read_csv(unprocessed_data_location+'Download_data_RP.tar.gz',sep='\t') # PRO

# We select only strong evidence interactions for hsa
passes_score = RNA_protein['score'] >= 0.2886
species1_is_human = RNA_protein['Species1'].str.contains('apiens')
species2_is_human = RNA_protein['Species2'].str.contains('apiens')
RNA_protein = RNA_protein[passes_score & species1_is_human & species2_is_human]

print(set(RNA_protein.Category2)) # proteins are all in the second column
print(set(RNA_protein.Category1))

# Strip the database prefixes from the RNA identifiers.
for db_prefix in ("NCBI:", "miRBase:", "circBase:", "Ensembl:"):
    RNA_protein.Raw_ID1 = RNA_protein.Raw_ID1.str.replace(db_prefix, '')

# Multi-valued identifiers (';'-separated) become one row per identifier.
for id_column in ('Raw_ID1', 'Raw_ID2'):
    RNA_protein[id_column] = RNA_protein[id_column].str.split(';')
    RNA_protein = RNA_protein.explode(id_column)

# Resolve the protein identifier in two passes, each using a "column 0 -> column 1" lookup:
# first keyed on Raw_ID2, then keyed on Interactor2.Symbol. In both passes a hit replaces
# Raw_ID2 and a miss leaves the current Raw_ID2 in place.
entrez_pro_map[0] = entrez_pro_map[0].astype(str)
for lookup, key_column in ((entrez_pro_map, 'Raw_ID2'), (symbol_to_pro, 'Interactor2.Symbol')):
    RNA_protein = pd.merge(RNA_protein, lookup.rename(columns={0: key_column}), on=key_column, how ='left')
    RNA_protein[1] = RNA_protein[1].fillna(RNA_protein['Raw_ID2'])
    RNA_protein = RNA_protein.drop(columns=['Raw_ID2']).rename(columns={1:'Raw_ID2'})
RNA_protein.head(n=3)

In [None]:
# Where Raw_ID1 is a known 'Ensembl Gene ID', use the corresponding 'RNAcentral ID' instead;
# all other identifiers pass through unchanged.
ensembl_to_rnacentral = rnacentral_map_human_ensembl[['Ensembl Gene ID', 'RNAcentral ID']].drop_duplicates()
RNA_protein = pd.merge(RNA_protein, ensembl_to_rnacentral, how="left",
                       left_on=['Raw_ID1'], right_on=['Ensembl Gene ID'])
RNA_protein = RNA_protein.drop(columns=["Ensembl Gene ID"])

resolved_id = RNA_protein['RNAcentral ID'].fillna(RNA_protein['Raw_ID1'])
RNA_protein['RNAcentral ID'] = resolved_id
RNA_protein = RNA_protein.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})

RNA_protein.head(n=3)

In [None]:
# Where Raw_ID1 is a known 'miRBase ID', use the corresponding 'RNAcentral ID' instead;
# all other identifiers pass through unchanged.
mirbase_to_rnacentral = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
RNA_protein = pd.merge(RNA_protein, mirbase_to_rnacentral, how="left",
                       left_on=['Raw_ID1'], right_on=['miRBase ID'])
RNA_protein = RNA_protein.drop(columns=["miRBase ID"])

resolved_id = RNA_protein['RNAcentral ID'].fillna(RNA_protein['Raw_ID1'])
RNA_protein['RNAcentral ID'] = resolved_id
RNA_protein = RNA_protein.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})

RNA_protein.head(n=3)

In [None]:
# Show a few tRNA symbols, then resolve symbols that equal a 'GtRNAdb Gene ID' to their
# 'RNAcentral ID'; rows without a match keep their current Raw_ID1.
print(RNA_protein[RNA_protein['Category1'] == 'tRNA']['Interactor1.Symbol'].unique()[:3])
gtrnadb_to_rnacentral = (
    rnacentral_map_human_gtrnadb[['RNAcentral ID','GtRNAdb Gene ID']]
    .drop_duplicates()
    .rename(columns={'GtRNAdb Gene ID': 'Interactor1.Symbol'})
)
RNA_protein = pd.merge(RNA_protein, gtrnadb_to_rnacentral, on='Interactor1.Symbol', how ='left')
resolved_id = RNA_protein['RNAcentral ID'].fillna(RNA_protein['Raw_ID1'])
RNA_protein['RNAcentral ID'] = resolved_id
RNA_protein = RNA_protein.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID':'Raw_ID1'})
RNA_protein.head(n=3)

In [None]:
# Re-key mRNA interactors: Raw_ID1 is matched against column 0 of ensembl_entrezTranscript_map
# (rows whose column 2 is 'protein-coding') and replaced by column 1.
# Inner join: mRNA rows without a match are dropped.
is_mrna = RNA_protein['Category1'] == 'mRNA'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)  # join key must be str like Raw_ID1
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
mrna = (
    pd.merge(RNA_protein[is_mrna], ensembl_entrezTranscript_map_mrna, left_on=['Raw_ID1'], right_on=[0])
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# Boolean mask instead of drop(index=...): a label-based drop is only correct while the
# index is unique, which this cell cannot guarantee on its own.
RNA_protein = pd.concat([mrna, RNA_protein[~is_mrna]])

mrna.head(n=3)

In [None]:
# Re-key the remaining interactor categories (everything except miRNA, pseudo, mRNA, tRNA, rRNA):
# match (symbol, category) against the human (Organism 9606) rows of rnacentral_map_hgnc and take
# 'RNAcentral ID' as Raw_ID1. Inner join: selected rows without an HGNC match are dropped.
is_ncrna = ~RNA_protein['Category1'].isin(['miRNA', 'pseudo', 'mRNA', 'tRNA', 'rRNA'])
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]
ncrna = pd.merge(RNA_protein[is_ncrna], rnacentral_map_human_hgnc_type,
                 left_on=['Interactor1.Symbol','Category1'],
                 right_on=["HGNC symbol",'RNA category']).drop(columns=["Interactor1.Symbol",'Raw_ID1','RNA category'])
ncrna = ncrna.rename(columns={"RNAcentral ID":'Raw_ID1'})
# Boolean mask instead of drop(index=i): after the mRNA concat the index holds duplicate labels,
# so a label-based drop would also delete re-keyed mRNA rows sharing a label with an ncRNA row.
RNA_protein = pd.concat([ncrna, RNA_protein[~is_ncrna]])

ncrna.head(n=3)

In [None]:
# Start mislabeled data

In [None]:
RNA_RNA = pd.read_csv(unprocessed_data_location+'Download_data_RR.tar.gz',sep='\t').rename(columns={'Download_data_RR.txt':'RNAInterID'})

# We select only strong evidence interactions for hsa
passes_score = RNA_RNA['score'] >= 0.2886
species1_is_human = RNA_RNA['Species1'].str.contains('apiens')
species2_is_human = RNA_RNA['Species2'].str.contains('apiens')
RNA_RNA = RNA_RNA[passes_score & species1_is_human & species2_is_human]

# 'PCG' category labels are rewritten as 'mRNA' on both sides.
RNA_RNA.Category1 = RNA_RNA.Category1.str.replace("PCG", 'mRNA')
RNA_RNA.Category2 = RNA_RNA.Category2.str.replace("PCG", 'mRNA')

# Identifier prefixes: the first three are removed, the tRF one is rewritten to 'trfdb?'.
prefix_rewrites = (
    ("NCBI:", ''),
    ("miRBase:", ''),
    ("circBase:", ''),
    ("tRFdb:", 'trfdb?'),  # tRF
)
for db_prefix, substitute in prefix_rewrites:
    RNA_RNA.Raw_ID1 = RNA_RNA.Raw_ID1.str.replace(db_prefix, substitute)
    RNA_RNA.Raw_ID2 = RNA_RNA.Raw_ID2.str.replace(db_prefix, substitute)

# Multi-valued identifiers (';'-separated) become one row per identifier.
for id_column in ('Raw_ID1', 'Raw_ID2'):
    RNA_RNA[id_column] = RNA_RNA[id_column].str.split(';')
    RNA_RNA = RNA_RNA.explode(id_column)

# Keep only the part of each symbol before the first '.'.
for symbol_column in ('Interactor1.Symbol', 'Interactor2.Symbol'):
    RNA_RNA[symbol_column] = RNA_RNA[symbol_column].str.split('.').str[0]

RNA_RNA.head(n=3)

In [None]:
# All category labels that appear on either side of an interaction.
a, b = set(RNA_RNA.Category1), set(RNA_RNA.Category2)
a | b

In [None]:
# For both interactor columns in turn: where the identifier is a known 'miRBase ID', use the
# corresponding 'RNAcentral ID' instead; all other identifiers pass through unchanged.
mirbase_to_rnacentral = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
for id_column in ('Raw_ID1', 'Raw_ID2'):
    RNA_RNA = pd.merge(RNA_RNA, mirbase_to_rnacentral, how="left",
                       left_on=[id_column], right_on=['miRBase ID'])
    RNA_RNA = RNA_RNA.drop(columns=["miRBase ID"])
    RNA_RNA['RNAcentral ID'] = RNA_RNA['RNAcentral ID'].fillna(RNA_RNA[id_column])
    RNA_RNA = RNA_RNA.drop(columns=[id_column]).rename(columns={'RNAcentral ID': id_column})

RNA_RNA.head(n=2)

In [None]:
# Inspect the rows whose second interactor is labelled 'piRNA'.
second_is_pirna = RNA_RNA['Category2'] == 'piRNA'
print(RNA_RNA[second_is_pirna])

In [None]:
# Manual curation: print the piRNA rows, then overwrite Raw_ID2 on two rows that are
# addressed by hard-coded index labels.
# NOTE(review): labels 31571 / 39194 are only meaningful for the exact input file and the
# merges executed before this cell. If a label is absent, .loc assignment appends a new
# row instead of raising - confirm the labels still point at the intended rows.
print(RNA_RNA[RNA_RNA['Category2'] == 'piRNA'])
RNA_RNA.loc[31571, 'Raw_ID2'] = 'piR-hsa-39980'
RNA_RNA.loc[39194, 'Raw_ID2'] = 'piR-hsa-20280'

In [None]:
# Inspect the rows whose first interactor is labelled 'tRNA'.
first_is_trna = RNA_RNA['Category1'] == 'tRNA'
print(RNA_RNA[first_is_trna])

In [None]:
# Manual curation: print the tRNA rows, then overwrite Raw_ID1 on two rows that are
# addressed by hard-coded index labels.
# NOTE(review): labels 23191 / 23192 are only meaningful for the exact input file and the
# merges executed before this cell. If a label is absent, .loc assignment appends a new
# row instead of raising - confirm the labels still point at the intended rows.
print(RNA_RNA[RNA_RNA['Category1'] == 'tRNA'])
RNA_RNA.loc[23191, 'Raw_ID1'] = 'URS0000287398'
RNA_RNA.loc[23192, 'Raw_ID1'] = 'URS00003C9A26'

In [None]:
# Collapse the pseudogene transcript types listed below to the single label 'pseudo'.
# The replacements are substring-based, so a name must be handled before any shorter name
# it contains. In the previous ordering 'processed_pseudogene' and 'pseudogene' were replaced
# first, which turned 'translated_processed_pseudogene' into 'translated_pseudo' and
# 'IG_pseudogene' into 'IG_pseudo'; their own replacements could then never match.
pseudogene_transcript_types = [
    "transcribed_unitary_pseudogene",
    "transcribed_unprocessed_pseudogene",
    "transcribed_processed_pseudogene",
    "translated_processed_pseudogene",
    "unprocessed_pseudogene",
    "processed_pseudogene",
    "unitary_pseudogene",
    "IG_V_pseudogene",
    "IG_C_pseudogene",
    "IG_J_pseudogene",
    "TR_J_pseudogene",
    "TR_V_pseudogene",
    "IG_pseudogene",
    "pseudogene",
]
transcript_type = ensembl_map['ensembl_transcript_type']
# Longest first guarantees that a name is always processed before any name it contains.
for type_name in sorted(pseudogene_transcript_types, key=len, reverse=True):
    transcript_type = transcript_type.str.replace(type_name, "pseudo", regex=False)
ensembl_map['ensembl_transcript_type'] = transcript_type

In [None]:
# Re-key pseudogene interactors on both sides: the identifier is matched against column 0 of
# ensembl_entrezTranscript_map (rows whose column 2 is 'pseudo') and replaced by column 1.
# Inner join: pseudogene rows without a match are dropped.
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_pseudo = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'pseudo']
for id_column, category_column in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    is_pseudo = RNA_RNA[category_column] == 'pseudo'
    pseudo = (
        pd.merge(RNA_RNA[is_pseudo], ensembl_entrezTranscript_map_pseudo, left_on=[id_column], right_on=[0])
        .drop(columns=[id_column, 0, 2, 3, 4, 5])
        .rename(columns={1: id_column})
    )
    # Boolean mask instead of drop(index=...): after the first concat the index holds duplicate
    # labels, and a label-based drop would also delete unrelated rows that share a label.
    RNA_RNA = pd.concat([pseudo, RNA_RNA[~is_pseudo]])

pseudo.head(n=3)

In [None]:
# Re-key mRNA interactors on both sides: the identifier is matched against column 0 of
# ensembl_entrezTranscript_map (rows whose column 2 is 'protein-coding') and replaced by column 1.
# Inner join: mRNA rows without a match are dropped.
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
for id_column, category_column in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    is_mrna = RNA_RNA[category_column] == 'mRNA'
    mrna = (
        pd.merge(RNA_RNA[is_mrna], ensembl_entrezTranscript_map_mrna, left_on=[id_column], right_on=[0])
        .drop(columns=[id_column, 0, 2, 3, 4, 5])
        .rename(columns={1: id_column})
    )
    # Boolean mask instead of drop(index=...): RNA_RNA holds duplicate index labels at this point
    # (it is a concat of several frames), so a label-based drop would delete unrelated rows too.
    RNA_RNA = pd.concat([mrna, RNA_RNA[~is_mrna]])

mrna.head(n=3)

In [None]:
# Re-key interactors labelled 'protein' or 'TF' on both sides: the identifier is matched against
# column 0 of entrez_pro_map and replaced by column 1.
# Inner join: protein/TF rows without a match are dropped.
entrez_pro_map[0] = entrez_pro_map[0].astype(str)
for id_column, category_column in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    is_protein = RNA_RNA[category_column].isin(['protein', 'TF'])
    protein = (
        pd.merge(RNA_RNA[is_protein], entrez_pro_map, left_on=[id_column], right_on=[0])
        .drop(columns=[id_column, 0])
        .rename(columns={1: id_column})
    )
    # Boolean mask instead of drop(index=...): RNA_RNA holds duplicate index labels at this point
    # (it is a concat of several frames), so a label-based drop would delete unrelated rows too.
    RNA_RNA = pd.concat([protein, RNA_RNA[~is_protein]])

protein.head(n=3)

In [None]:
# Re-key the remaining interactor categories on both sides. Categories listed in
# `categories_handled_elsewhere` are left untouched; every other row is matched on
# (symbol, category) against the human (Organism 9606) rows of rnacentral_map_hgnc and receives
# 'RNAcentral ID' as its identifier. Inner join: selected rows without a match are dropped.
categories_handled_elsewhere = ['protein', 'TF', 'mRNA', 'pseudo', 'piRNA', 'miRNA', 'eRNA', 'circRNA', 'tRF']
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]
for symbol_column, category_column, id_column in (('Interactor1.Symbol', 'Category1', 'Raw_ID1'),
                                                  ('Interactor2.Symbol', 'Category2', 'Raw_ID2')):
    is_ncrna = ~RNA_RNA[category_column].isin(categories_handled_elsewhere)
    ncrna = pd.merge(RNA_RNA[is_ncrna], rnacentral_map_human_hgnc_type,
                     left_on=[symbol_column, category_column],
                     right_on=["HGNC symbol",'RNA category']).drop(columns=[symbol_column, id_column, 'RNA category'])
    ncrna = ncrna.rename(columns={"RNAcentral ID": id_column})
    # Boolean mask instead of drop(index=...): RNA_RNA holds duplicate index labels at this point
    # (it is a concat of several frames), so a label-based drop would delete unrelated rows too.
    RNA_RNA = pd.concat([ncrna, RNA_RNA[~is_ncrna]])
RNA_RNA = RNA_RNA[['Raw_ID1','Raw_ID2','Category1','Category2','score','strong','weak','predict']]

ncrna.head(n=3)

In [None]:
# Category labels still present on either side after the re-keying steps.
a, b = set(RNA_RNA.Category1), set(RNA_RNA.Category2)
a | b

In [None]:
# Mislabeled data
RNA_protein_ = RNA_RNA[(RNA_RNA['Raw_ID1'].str.startswith("PR")) | (RNA_RNA['Raw_ID2'].str.startswith("PR"))]

In [None]:
# End mislabeled data

In [None]:
# Keep rows whose RNA identifier starts with URS / ENST / hsa_circ or with a digit.
RNA_protein = RNA_protein[(RNA_protein['Raw_ID1'].str.startswith("URS")) | (RNA_protein['Raw_ID1'].str.startswith("ENST")) |
                          (RNA_protein['Raw_ID1'].str.startswith("hsa_circ")) | RNA_protein['Raw_ID1'].str[0].str.isdigit()]

# Add the rows picked out of RNA_RNA (identifier starting with "PR"); they skip the filter above.
RNA_protein = pd.concat([RNA_protein_, RNA_protein])

# Build one 'Method' value per row from the strong/weak/predict columns, joined by '//'.
RNA_protein['Method'] = RNA_protein['strong'].astype(str) + '//' + RNA_protein['weak'].astype(str) + '//' + RNA_protein['predict'].astype(str)
RNA_protein['Method'] = RNA_protein['Method'].str.lower()
# Remove the 'nan' placeholders left by missing columns. Literal matching replaces the former
# 'nan\/\/' patterns, whose '\/' is an invalid escape sequence in a normal string literal.
RNA_protein['Method'] = RNA_protein['Method'].str.replace('nan//', '', regex=False)
RNA_protein['Method'] = RNA_protein['Method'].str.replace('//nan', '', regex=False)
RNA_protein['Method'] = RNA_protein['Method'].replace('nan',np.nan)
# One row per individual method, then translate through method_map ('0_y' -> '0_x');
# methods missing from the map keep their original text.
RNA_protein['Method'] = RNA_protein['Method'].str.split('//')
RNA_protein = RNA_protein.explode('Method')
RNA_protein = pd.merge(RNA_protein, method_map, right_on='0_y', left_on='Method', how='left')
RNA_protein['0_x'] = RNA_protein['0_x'].fillna(RNA_protein['Method'])
RNA_protein = RNA_protein.drop(columns=['0_y', 'Method'])
RNA_protein = RNA_protein.rename(columns={'0_x':'Method'})

RNA_protein['Source'] = 'RNAInter'
RNA_protein = RNA_protein.rename(columns={'Raw_ID1':':START_ID', 'Raw_ID2':':END_ID', 'score':'RNAsister_score'})
# Rows whose start identifier begins with a digit go to gene_protein; the rest stay in RNA_protein.
starts_with_digit = RNA_protein[':START_ID'].str[0].str.isdigit()
gene_protein = RNA_protein[starts_with_digit]
RNA_protein = RNA_protein[~starts_with_digit]
print(gene_protein.head(n=3))
RNA_protein.head(n=3)

* [miRNet](https://www.mirnet.ca/miRNet/)

In [None]:
# Download the miRNet miRNA-molecule table into the unprocessed-data folder.
# The local file name keeps the '?dl=0' suffix of the URL; the cell that reads it uses the same name.
!wget https://www.dropbox.com/s/abaeonmjpftbspx/miRNet-mir-mol-hsa.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-mir-mol-hsa.csv?dl=0

In [None]:
miRNA_chemical2 = pd.read_csv(unprocessed_data_location + 'miRNet-mir-mol-hsa.csv?dl=0') # ChEBI+DrugBank
miRNA_chemical2.molecule=miRNA_chemical2.molecule.str.lower()
miRNA_chemical2=miRNA_chemical2.drop(columns=['mirnet','mir_id','pubchem_id'])
miRNA_chemical2['mir_acc'] = miRNA_chemical2['mir_acc'].str.strip()

print(all(miRNA_chemical2['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
# These are all entries belonging to miRBase but not directly to RNAcentral --> use rnacentral_map_human
print(miRNA_chemical2[~miRNA_chemical2['mir_acc'].isin(rnacentral_map_human_mirbase['miRBase ID'])]['mir_acc'].unique()[:3])
# Inner join on the accession: rows whose accession is not a 'DB ID' in rnacentral_map_human are dropped.
miRNA_chemical2 = pd.merge(miRNA_chemical2, rnacentral_map_human.rename(columns={'DB ID':'mir_acc'}), on='mir_acc').drop(
    columns=['mir_acc','DB','Organism','RNA category','DB Description']).rename(columns={'RNAcentral ID':'RNA'})

# Chemical identifier: the desc_chebi_map match on the molecule name when there is one,
# otherwise the 'drug_bank' value; rows with neither are removed.
miRNA_chemical2 = pd.merge(miRNA_chemical2, desc_chebi_map, left_on=['molecule'], right_on=[0], how='left').drop(
    columns=[0,'molecule']).rename(columns={1:'Chemical'})
# Assign the result back: `miRNA_chemical2['Chemical'].fillna(..., inplace=True)` acts on a
# temporary and silently does nothing under Copy-on-Write.
miRNA_chemical2['Chemical'] = miRNA_chemical2['Chemical'].fillna(miRNA_chemical2['drug_bank'])
miRNA_chemical2 = miRNA_chemical2.drop(columns=['drug_bank'])
miRNA_chemical2 = miRNA_chemical2[miRNA_chemical2['Chemical'].notna()]

# Translate methods through method_map ('0_y' -> '0_x'); unmapped values keep their original text.
miRNA_chemical2 = pd.merge(miRNA_chemical2, method_map, right_on='0_y', left_on='method', how='left')
miRNA_chemical2['0_x'] = miRNA_chemical2['0_x'].fillna(miRNA_chemical2['method'])
miRNA_chemical2 = miRNA_chemical2.drop(columns=['0_y', 'method'])
miRNA_chemical2 = miRNA_chemical2.rename(columns={'0_x':'Method'})

# Translate conditions through disease_map in the same way; the result is stored as 'Location'.
miRNA_chemical2 = pd.merge(miRNA_chemical2, disease_map, right_on='0_y', left_on='condition', how='left')
miRNA_chemical2['0_x'] = miRNA_chemical2['0_x'].fillna(miRNA_chemical2['condition'])
miRNA_chemical2 = miRNA_chemical2.drop(columns=['0_y', 'condition'])
miRNA_chemical2 = miRNA_chemical2.rename(columns={'0_x':'Location'})

# PMIDs as strings without the float suffix; non-numeric values become NaN.
miRNA_chemical2['pmid'] = pd.to_numeric(miRNA_chemical2['pmid'], errors='coerce')
miRNA_chemical2['pmid'] = miRNA_chemical2['pmid'].astype(str)
# Strip only a trailing '.0'. The bare pattern ".0" is a regex on pandas < 2.0, where '.' matches
# any character, so every "<char>0" pair inside a PMID was deleted as well.
miRNA_chemical2['pmid'] = miRNA_chemical2['pmid'].str.replace(r'\.0$', '', regex=True)
miRNA_chemical2['pmid'] = miRNA_chemical2['pmid'].replace("nan", np.nan)

print(miRNA_chemical2['expression'].unique())
miRNA_chemical2['Source'] = 'miRNet'
miRNA_chemical2 = miRNA_chemical2.rename(columns={'RNA':':START_ID','Chemical':':END_ID','pmid':'PubMedID'})
miRNA_chemical2.head(n=3)

* [miRandola](http://mirandola.iit.cnr.it/index.php)

In [7]:
# Drug names exactly as they appear (URL-encoded) in the miRandola `view_drug.php?LV=` query string.
drug_list=['aspirin','bevacizumab','clopidogrel',
           'conventional%20synthetic%20disease-modifying%20antirheumatic%20drugs%20(cs-dmards)',
           'docetaxel', 'epirubicin%20plus%20paclitaxel','fluorouracil%20(5-fu)','gemcitabine',
           'hypomethylating%20agents%20(hmas)','lapatinib','lithium','mercury','n-acetyl%20cysteine%20(nac)',
           'paracetamol','platinum','praziquantel%20(pzq)','sorafenib','testosterone',
           'transarterial%20chemoembolization%20(tace)','trastuzumab','xuezhikang'
          ]
miRNA_chemical_mirandola=[] # ChEBI
for drug in drug_list:
    # Download and parse each drug page once. read_html returns every table on the page, so there
    # is no need to request the page again for each table (plus once more just to count them).
    drug_tables = pd.read_html('http://mirandola.iit.cnr.it/view_drug.php?LV='+drug,header=0)
    for miRNA_chemical in drug_tables:
        # Transpose the table and promote its first row to column names.
        miRNA_chemical = miRNA_chemical.T
        miRNA_chemical.columns = miRNA_chemical.iloc[0]
        # Drop the row that became the header, then the row at position 1 of what remains.
        miRNA_chemical = miRNA_chemical.drop(index=miRNA_chemical.iloc[0].name)
        miRNA_chemical = miRNA_chemical.drop(index=miRNA_chemical.iloc[1].name)
        # Drop the first 16 columns (the slice is iterated as column labels).
        miRNA_chemical = miRNA_chemical.drop(miRNA_chemical.iloc[:, :16],axis = 1)
        miRNA_chemical_mirandola.append(miRNA_chemical)

miRNA_chemical_mirandola = pd.concat(miRNA_chemical_mirandola)
print(miRNA_chemical_mirandola['Data imported from external databases?'].unique())
# Keep only records that carry a miRBase accession, then drop the columns not used downstream.
miRNA_chemical_mirandola = miRNA_chemical_mirandola[miRNA_chemical_mirandola['miRBase Accession'].notna()]
miRNA_chemical_mirandola=miRNA_chemical_mirandola.drop(columns=['RNA from literature','RNA class', 'miRBase ID','Experiment Description/Results',
                                                                'miRBase family', 'Organism','First Author','Journal',
                                                                'Title','Year of publication','Data imported from external databases?'])
miRNA_chemical_mirandola.head(n=3)

['No']


Unnamed: 0,miRBase Accession,Potential biomarker role defined in the literature,exRNA form,Sample,Sample source,"Diseases, Cell Lines or normal status",Expression,Drug,PubMed ID,Methods
Unnamed: 1,MIMAT0000445,yes,circulating,plasma,-,type 2 diabetes,down,aspirin,23386708,In vitro and in vivo platelet activation
Unnamed: 1,MIMAT0000092,unknown,circulating,plasma,-,aspirin resistance,-,aspirin,27208561,Rt-qpcr
Unnamed: 1,MIMAT0000445,yes,circulating,plasma,-,metastatic colorectal cancer (mcrc),up,bevacizumab,25584492,Qrt-pcr


In [8]:
# Report whether every miRBase accession is present in the RNAcentral mapping,
# and show (up to three of) the accessions that are not.
print(all(miRNA_chemical_mirandola['miRBase Accession'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
print(miRNA_chemical_mirandola[~miRNA_chemical_mirandola['miRBase Accession'].isin(rnacentral_map_human_mirbase[
    'miRBase ID'])]['miRBase Accession'].unique()[:3])
# MIMAT0001818 is a dre sequence, not a human one
miRNA_chemical_mirandola = pd.merge(miRNA_chemical_mirandola, rnacentral_map_human_mirbase.rename(columns={'miRBase ID':'miRBase Accession'}),
                                    on='miRBase Accession', how='left').rename(columns={'RNAcentral ID':'RNA'})
# Rows without a mapping fall back to their accession; MIMAT0005905 is first
# rewritten to a hard-coded RNAcentral ID so that the fallback yields that ID.
miRNA_chemical_mirandola['miRBase Accession'] = miRNA_chemical_mirandola['miRBase Accession'].str.replace(
    'MIMAT0005905', 'URS000047047A', regex=False)
miRNA_chemical_mirandola['RNA'] = miRNA_chemical_mirandola['RNA'].fillna(miRNA_chemical_mirandola['miRBase Accession'])
miRNA_chemical_mirandola = miRNA_chemical_mirandola.drop(columns=['miRBase Accession'])
miRNA_chemical_mirandola = miRNA_chemical_mirandola[miRNA_chemical_mirandola['RNA'] != 'MIMAT0001818']
miRNA_chemical_mirandola['Drug'] = miRNA_chemical_mirandola['Drug'].str.strip()
# Inner join: records whose drug has no entry in desc_chebi_map are dropped
miRNA_chemical_mirandola = pd.merge(miRNA_chemical_mirandola, desc_chebi_map, left_on=['Drug'], right_on=[0]).drop(
    columns=[0]).rename(columns={1:'Chemical'})

# Normalise method names through method_map, keeping the raw label when unmapped
miRNA_chemical_mirandola['Methods'] = miRNA_chemical_mirandola['Methods'].str.lower()
miRNA_chemical_mirandola = pd.merge(miRNA_chemical_mirandola, method_map, right_on='0_y', left_on='Methods', how='left')
miRNA_chemical_mirandola['0_x'] = miRNA_chemical_mirandola['0_x'].fillna(miRNA_chemical_mirandola['Methods'])
miRNA_chemical_mirandola = miRNA_chemical_mirandola.drop(columns=['0_y', 'Methods'])
miRNA_chemical_mirandola = miRNA_chemical_mirandola.rename(columns={'0_x':'Method'})

# Normalise the disease / cell-line context through disease_map, keeping the raw label when unmapped
miRNA_chemical_mirandola['Diseases, Cell Lines or normal status'] = miRNA_chemical_mirandola['Diseases, Cell Lines or normal status'].str.lower()
miRNA_chemical_mirandola = pd.merge(miRNA_chemical_mirandola, disease_map, right_on='0_y', left_on='Diseases, Cell Lines or normal status', how='left')
miRNA_chemical_mirandola['0_x'] = miRNA_chemical_mirandola['0_x'].fillna(miRNA_chemical_mirandola['Diseases, Cell Lines or normal status'])
miRNA_chemical_mirandola = miRNA_chemical_mirandola.drop(columns=['0_y', 'Diseases, Cell Lines or normal status'])
miRNA_chemical_mirandola = miRNA_chemical_mirandola.rename(columns={'0_x':'Location'})

# PubMed IDs as strings without a float suffix. The pattern is anchored and
# regex mode is explicit: a bare ".0" pattern is a literal or a regex depending
# on the pandas version, and as a regex it would also delete "<any char>0"
# from inside an ID.
miRNA_chemical_mirandola['PubMed ID'] = (
    pd.to_numeric(miRNA_chemical_mirandola['PubMed ID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace("nan", np.nan)
)

# Only the miRandola frame is touched in this cell (no edits to the unrelated
# `miRNA_chemical` variable, so the cell does not depend on leftover state).
miRNA_chemical_mirandola['Source'] = 'miRandola'
miRNA_chemical_mirandola = miRNA_chemical_mirandola.rename(
    columns={'RNA':':START_ID','Chemical':':END_ID','PubMed ID':'PubMedID'})
miRNA_chemical_mirandola.head(n=3)

False
['MIMAT0005905' 'MIMAT0001818']


Unnamed: 0,Potential biomarker role defined in the literature,exRNA form,Sample,Sample source,Expression,Drug,PubMedID,:START_ID,:END_ID,Method,Location,Source
0,yes,circulating,plasma,-,down,aspirin,23386708,URS00001F1DA8,CHEBI_15365,in vitro and in vivo platelet activation,type 2 diabetes mellitus,miRandola
1,unknown,circulating,plasma,-,-,aspirin,27208561,URS00003768C5,CHEBI_15365,rt-qpcr,aspirin resistance,miRandola
2,yes,circulating,plasma,-,up,bevacizumab,25584492,URS00001F1DA8,DB00112,quantitative reverse transcriptase pcr,metastatic colorectal cancer (mcrc),miRandola


In [9]:
# miRandola records whose expression direction is unspecified ('-'); the
# descriptive columns are removed from the resulting frame.
miRNA_chemical_mirandola_intwith = (
    miRNA_chemical_mirandola
    .loc[lambda df: df['Expression'].eq('-')]
    .drop(columns=[
        'Expression',
        'Potential biomarker role defined in the literature',
        'exRNA form',
        'Sample',
        'Sample source',
        'Drug',
    ])
)
miRNA_chemical_mirandola_intwith.head(3)

Unnamed: 0,PubMedID,:START_ID,:END_ID,Method,Location,Source
1,27208561,URS00003768C5,CHEBI_15365,rt-qpcr,aspirin resistance,miRandola


* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki: lncRNA -> protein / transcription-factor targets.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv') # TF (PRO)
lncRNA_protein2 = LncRNAWiki[LncRNAWiki['target_type'].notna()]
# miRNAs wrongly labelled as TF are not a concern: they are discarded when the targets are mapped to PRO
lncRNA_protein2 = pd.concat([lncRNA_protein2[lncRNA_protein2.target_type.str.contains('TF')],
    lncRNA_protein2[lncRNA_protein2.target_type.str.contains('protein')]])
lncRNA_protein2 = lncRNA_protein2.drop(
    columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','modification_detail','biological_process',
             'conservation_species','target_interaction','description','conservation',
             'biological_context', 'genome_variation','variation_detail','epigenetic_modification',
             'expression','regulator_type','regulator_interaction','target_type','molecular_function',
             'regulator','regulator_effect','target_effect','functional_mechanism','clinical_detail','expression_detail'
             ])
# One row per transcript, then map LncBook transcript IDs to RNAcentral IDs (inner join)
lncRNA_protein2['transcript_id'] = lncRNA_protein2['transcript_id'].str.split(',')
lncRNA_protein2 = lncRNA_protein2.explode('transcript_id')
lncRNA_protein2 = lncRNA_protein2[lncRNA_protein2['transcript_id'].notna()]
lncRNA_protein2 = pd.merge(lncRNA_protein2, rnacentral_map_human_lncbook[['RNAcentral ID', 'LncBook Transcript ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})
# Map target symbols through symbol_to_pro (inner join: unmapped targets are dropped)
lncRNA_protein2 = pd.merge(lncRNA_protein2, symbol_to_pro.rename(columns={0:'target'}), on='target').drop(columns=['target']).rename(columns={
        1:'TF'})

# PubMed IDs as strings without a float suffix; unparsable or missing IDs become NaN.
# After to_numeric(...).astype(str) a missing value is spelled "nan" (it is
# "<NA>" only for nullable dtypes), so both spellings are cleared. The ".0"
# suffix is removed with an anchored, explicit regex so the result does not
# depend on the pandas-version default of `regex`.
lncRNA_protein2['pmid'] = (
    pd.to_numeric(lncRNA_protein2['pmid'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace(["nan", "<NA>"], np.nan)
)

lncRNA_protein2['drug'] = lncRNA_protein2['drug'].str.lower().str.split(";")
lncRNA_protein2 = lncRNA_protein2.explode('drug')

# Normalise experimental methods through method_map, keeping the raw label when unmapped
lncRNA_protein2['experimental_method'] = lncRNA_protein2['experimental_method'].str.lower().str.split(";")
lncRNA_protein2 = lncRNA_protein2.explode('experimental_method')
lncRNA_protein2 = pd.merge(lncRNA_protein2, method_map, right_on='0_y', left_on='experimental_method', how='left')
lncRNA_protein2['0_x'] = lncRNA_protein2['0_x'].fillna(lncRNA_protein2['experimental_method'])
lncRNA_protein2 = lncRNA_protein2.drop(columns=['0_y', 'experimental_method'])
lncRNA_protein2 = lncRNA_protein2.rename(columns={'0_x':'Method','pmid':'PubMedID', 'drug':'Drug', 'regulator':'Regulator', 'target':'Interactor'})

# Normalise tissues / cell lines through location_map, keeping the raw label when unmapped
lncRNA_protein2['tissue/cell line'] = lncRNA_protein2['tissue/cell line'].str.lower().str.split(";")
lncRNA_protein2 = lncRNA_protein2.explode('tissue/cell line')
lncRNA_protein2 = pd.merge(lncRNA_protein2, location_map, right_on='0_y', left_on='tissue/cell line', how='left')
lncRNA_protein2['0_x'] = lncRNA_protein2['0_x'].fillna(lncRNA_protein2['tissue/cell line'])
lncRNA_protein2 = lncRNA_protein2.drop(columns=['0_y', 'tissue/cell line'])
lncRNA_protein2 = lncRNA_protein2.rename(columns={'0_x':'Location'})

# Normalise the context detail through disease_map into a second location column
lncRNA_protein2['context_detail'] = lncRNA_protein2['context_detail'].str.lower().str.split(";")
lncRNA_protein2 = lncRNA_protein2.explode('context_detail')
lncRNA_protein2 = pd.merge(lncRNA_protein2, disease_map, right_on='0_y', left_on='context_detail', how='left')
lncRNA_protein2['0_x'] = lncRNA_protein2['0_x'].fillna(lncRNA_protein2['context_detail'])
lncRNA_protein2 = lncRNA_protein2.drop(columns=['0_y', 'context_detail'])
lncRNA_protein2 = lncRNA_protein2.rename(columns={'0_x':'Location2'})

# Stack the two location columns into a single 'Location' column (rows are duplicated, one per column)
lncRNA_protein2 = pd.concat([lncRNA_protein2.drop(columns=['Location2']),
                             lncRNA_protein2.drop(columns=['Location']).rename(columns={'Location2':'Location'})])

lncRNA_protein2['Source'] = 'LncRNAWiki'
lncRNA_protein2 = lncRNA_protein2.rename(columns={'RNA':':START_ID','TF':':END_ID'})
lncRNA_protein2.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki: lncRNA -> drug associations.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv') # ChEBI+DrugBank
lncRNA_chemical = LncRNAWiki[LncRNAWiki['drug'].notna()]
lncRNA_chemical = lncRNA_chemical.drop(columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','clinical_detail',
                                        'modification_detail','target_interaction', 'conservation_species','target_effect',
                                        'description','conservation','target_type','biological_context','regulator_effect',
                                        'regulator_interaction', 'genome_variation', 'variation_detail', 'molecular_function',
                                        'expression','regulator_type','functional_mechanism'])
# Remove columns that are entirely empty for the drug-annotated rows
lncRNA_chemical = lncRNA_chemical.dropna(axis=1, how='all')

# One row per transcript
lncRNA_chemical['transcript_id'] = lncRNA_chemical['transcript_id'].str.split(',')
lncRNA_chemical = lncRNA_chemical.explode('transcript_id')
lncRNA_chemical = lncRNA_chemical[lncRNA_chemical['transcript_id'].notna()]

# One row per drug
# NOTE(review): drug names are split on ',' and are neither lower-cased nor
# stripped before the join on desc_chebi_map below, whereas the TF cell splits
# the same column on ';' after lower-casing — confirm delimiter and key casing.
lncRNA_chemical['drug'] = lncRNA_chemical['drug'].str.split(',')
lncRNA_chemical = lncRNA_chemical.explode('drug')
lncRNA_chemical = lncRNA_chemical[lncRNA_chemical['drug'].notna()]

# PubMed IDs as strings without a float suffix; unparsable or missing IDs become NaN.
# After to_numeric(...).astype(str) a missing value is spelled "nan" (it is
# "<NA>" only for nullable dtypes), so both spellings are cleared. The ".0"
# suffix is removed with an anchored, explicit regex so the result does not
# depend on the pandas-version default of `regex`.
lncRNA_chemical['pmid'] = (
    pd.to_numeric(lncRNA_chemical['pmid'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace(["nan", "<NA>"], np.nan)
)

# Normalise experimental methods through method_map, keeping the raw label when unmapped
lncRNA_chemical['experimental_method'] = lncRNA_chemical['experimental_method'].str.lower().str.split(";")
lncRNA_chemical = lncRNA_chemical.explode('experimental_method')
lncRNA_chemical = pd.merge(lncRNA_chemical, method_map, right_on='0_y', left_on='experimental_method', how='left')
lncRNA_chemical['0_x'] = lncRNA_chemical['0_x'].fillna(lncRNA_chemical['experimental_method'])
lncRNA_chemical = lncRNA_chemical.drop(columns=['0_y', 'experimental_method'])
lncRNA_chemical = lncRNA_chemical.rename(columns={'0_x':'Method','pmid':'PubMedID', 'regulator':'Regulator', 'target':'Interactor'})

# Normalise tissues / cell lines through location_map, keeping the raw label when unmapped
lncRNA_chemical['tissue/cell line'] = lncRNA_chemical['tissue/cell line'].str.lower().str.split(";")
lncRNA_chemical = lncRNA_chemical.explode('tissue/cell line')
lncRNA_chemical = pd.merge(lncRNA_chemical, location_map, right_on='0_y', left_on='tissue/cell line', how='left')
lncRNA_chemical['0_x'] = lncRNA_chemical['0_x'].fillna(lncRNA_chemical['tissue/cell line'])
lncRNA_chemical = lncRNA_chemical.drop(columns=['0_y', 'tissue/cell line'])
lncRNA_chemical = lncRNA_chemical.rename(columns={'0_x':'Location'})

# Normalise the context detail through disease_map into a second location column
lncRNA_chemical['context_detail'] = lncRNA_chemical['context_detail'].str.lower().str.split(";")
lncRNA_chemical = lncRNA_chemical.explode('context_detail')
lncRNA_chemical = pd.merge(lncRNA_chemical, disease_map, right_on='0_y', left_on='context_detail', how='left')
lncRNA_chemical['0_x'] = lncRNA_chemical['0_x'].fillna(lncRNA_chemical['context_detail'])
lncRNA_chemical = lncRNA_chemical.drop(columns=['0_y', 'context_detail'])
lncRNA_chemical = lncRNA_chemical.rename(columns={'0_x':'Location2'})

# Stack the two location columns into a single 'Location' column (rows are duplicated, one per column)
lncRNA_chemical = pd.concat([lncRNA_chemical.drop(columns=['Location2']),
                             lncRNA_chemical.drop(columns=['Location']).rename(columns={'Location2':'Location'})])

# Map LncBook transcript IDs to RNAcentral IDs (inner join)
lncRNA_chemical = pd.merge(lncRNA_chemical, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})
lncRNA_chemical['Source'] = 'LncRNAWiki'
# Map drug names through desc_chebi_map (inner join: unmapped drugs are dropped)
lncRNA_chemical_chebi = pd.merge(desc_chebi_map.rename(columns={0:'drug'}), lncRNA_chemical, on=['drug']).drop(
    columns=['drug']).rename(columns={1:'Drug'})

lncRNA_chemical_chebi = lncRNA_chemical_chebi.rename(columns={'Drug':':END_ID','RNA':':START_ID'})
lncRNA_chemical_chebi.head(n=3)

In [None]:
# LncRNAWiki lncRNA-chemical rows annotated as 'Differentially expressed'.
lncRNA_chemical_chebi_intwith = (
    lncRNA_chemical_chebi
    .loc[lambda df: df['expression_detail'].eq('Differentially expressed')]
    .drop(columns=['expression_detail'])
)
lncRNA_chemical_chebi_intwith.head(3)

In [None]:
# Stack every RNA-OBO interaction table, then collapse duplicated
# (:START_ID, :END_ID) pairs: evidence columns are gathered into sets and the
# RNAsister score is averaged.
RNA_interacts_with_OBO = (
    pd.concat([
        lncRNA_chemical_chebi_intwith,
        lncRNA_protein2,
        miRNA_chemical_mirandola_intwith,
        miRNA_chemical2,
        RNA_protein,
        RNA_hisMod,
        RNA_chemical,
        ViRBase,
    ])
    .groupby([':START_ID', ':END_ID'])
    .agg({
        'Regulator': set,
        'Interactor': set,
        'PubMedID': set,
        'Method': set,
        'Location': set,
        'Source': set,
        'Drug': set,
        'RNAsister_score': np.mean,
    })
    .reset_index()
    .assign(**{':TYPE': 'interacts_with'})
)
RNA_interacts_with_OBO.to_pickle(unprocessed_edge_data_location + 'RNA_interacts_with_OBO.pkl')

# Same edges with the endpoints exchanged
OBO_interacts_with_RNA = RNA_interacts_with_OBO.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
OBO_interacts_with_RNA.to_pickle(unprocessed_edge_data_location + 'OBO_interacts_with_RNA.pkl')
OBO_interacts_with_RNA.head(3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011024 (indirectly positively regulates quantity of) - RNA

* [SM2miR](http://www.jianglab.cn/SM2miR/)

In [None]:
# SM2miR: small molecule -> miRNA expression effects (human records only).
miRNA_chemical = pd.read_excel(unprocessed_data_location + 'SM2miR3.xls') # ChEBI+DrugBank
miRNA_chemical = miRNA_chemical[miRNA_chemical['Species'].str.contains('sapiens')]
miRNA_chemical = miRNA_chemical.drop(columns=['miRNA','FDA','CID','Species','Year','Reference','Support'])
miRNA_chemical = miRNA_chemical[(~miRNA_chemical['miRBase'].isna())]

# Lower-case the molecule names and strip parenthesised parts. The pattern is a
# regular expression, so regex mode is requested explicitly (pandas >= 2
# defaults to literal matching, which would silently leave names untouched)
# and the pattern is a raw string to avoid invalid-escape warnings.
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.lower().str.replace(
    r"\(.*?\)| \(.*?\)", '', regex=True).str.rstrip()
# Combination treatments ("a + b") become one row per molecule
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.split('+')
miRNA_chemical = miRNA_chemical.explode('small melocule')
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.rstrip().str.lstrip()

print(all(miRNA_chemical['miRBase'].isin(rnacentral_map_human_mirbase['miRBase ID'])))
# These are all entries not belonging to miRBase but to external databases
print(miRNA_chemical[~miRNA_chemical['miRBase'].isin(rnacentral_map_human_mirbase['miRBase ID'])]['miRBase'].unique()[:3])
# Inner join: records without an RNAcentral mapping are dropped
miRNA_chemical = pd.merge(miRNA_chemical, rnacentral_map_human_mirbase.rename(columns={'miRBase ID':'miRBase'}), on='miRBase').drop(
    columns=['miRBase']).rename(columns={'RNAcentral ID':'RNA'})

# Map molecule names through desc_chebi_map; fall back to the 'DB' column when
# there is no match, and drop records that have neither.
miRNA_chemical = pd.merge(miRNA_chemical, desc_chebi_map, left_on=['small melocule'], right_on=[0], how='left').drop(
    columns=[0,'small melocule']).rename(columns={1:'Chemical'})
# Plain assignment instead of an in-place fillna on the selected column, which
# is not guaranteed to write back into the frame.
miRNA_chemical['Chemical'] = miRNA_chemical['Chemical'].fillna(miRNA_chemical['DB'])
miRNA_chemical = miRNA_chemical.drop(columns=['DB'])
miRNA_chemical = miRNA_chemical[miRNA_chemical['Chemical'].notna()]

# Harmonise the spelling of the expression pattern labels
print(miRNA_chemical['Expression pattern of miRNA'].unique())
miRNA_chemical['Expression pattern of miRNA'] = miRNA_chemical['Expression pattern of miRNA'].str.strip().replace({
    'up-regualted': 'up-regulated', 'up-regulated ': 'up-regulated', 'down-regualted': 'down-regulated'})

# PubMed IDs as strings without a float suffix; unparsable or missing IDs become NaN.
# After to_numeric(...).astype(str) a missing value is spelled "nan" (it is
# "<NA>" only for nullable dtypes), so both spellings are cleared.
miRNA_chemical['PMID'] = (
    pd.to_numeric(miRNA_chemical['PMID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace(["nan", "<NA>"], np.nan)
)

# Normalise detection methods through method_map, keeping the raw label when unmapped
miRNA_chemical['Detection method '] = miRNA_chemical['Detection method '].str.lower()
miRNA_chemical = pd.merge(miRNA_chemical, method_map, right_on='0_y', left_on='Detection method ', how='left')
miRNA_chemical['0_x'] = miRNA_chemical['0_x'].fillna(miRNA_chemical['Detection method '])
miRNA_chemical = miRNA_chemical.drop(columns=['0_y', 'Detection method '])
miRNA_chemical = miRNA_chemical.rename(columns={'0_x':'Method','PMID':'PubMedID', 'regulator':'Regulator', 'target':'Interactor'})

# Normalise the experimental condition through location_map, keeping the raw label when unmapped
miRNA_chemical['Condition'] = miRNA_chemical['Condition'].str.lower()
miRNA_chemical = pd.merge(miRNA_chemical, location_map, right_on='0_y', left_on='Condition', how='left')
miRNA_chemical['0_x'] = miRNA_chemical['0_x'].fillna(miRNA_chemical['Condition'])
miRNA_chemical = miRNA_chemical.drop(columns=['0_y', 'Condition'])
miRNA_chemical = miRNA_chemical.rename(columns={'0_x':'Location'})

# Every record is attributed to both sources (one row per source)
miRNA_chemical['Source'] = 'miRNet, SM2miR'
miRNA_chemical['Source'] =  miRNA_chemical['Source'].str.split(", ")
miRNA_chemical = miRNA_chemical.explode('Source')
# The chemical is the edge source and the RNA its target
miRNA_chemical = miRNA_chemical.rename(columns={'RNA':':END_ID','Chemical':':START_ID'})
miRNA_chemical.head(n=3)

In [None]:
# SM2miR records where the molecule up-regulates the miRNA. This frame feeds
# the "indirectly positively regulates quantity of" edge, so it must select
# 'up-regulated' (the 'down-regulated' records go to the negative edge).
miRNA_chemical_up = miRNA_chemical[miRNA_chemical['Expression pattern of miRNA'] == 'up-regulated'].drop(
    columns=['Expression pattern of miRNA'])
miRNA_chemical_up.head(n=3)

* [miRandola](http://mirandola.iit.cnr.it/index.php)

In [None]:
# miRandola records with expression 'up'. The filter column is dropped, as in
# the sibling 'down' and '-' selections, and the endpoints are exchanged so
# that the chemical becomes :START_ID and the RNA :END_ID.
miRNA_chemical_mirandola_up = miRNA_chemical_mirandola[miRNA_chemical_mirandola['Expression']=='up'].drop(
    columns=['Expression'])
miRNA_chemical_mirandola_up = miRNA_chemical_mirandola_up.rename(
    columns={':START_ID':':END_ID',':END_ID':':START_ID'})
miRNA_chemical_mirandola_up.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki rows annotated as 'Up-regulated', with the endpoints exchanged
# (:START_ID <-> :END_ID).
lncRNA_chemical_chebi_up = (
    lncRNA_chemical_chebi
    .loc[lambda df: df['expression_detail'].eq('Up-regulated')]
    .drop(columns=['expression_detail'])
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
)
lncRNA_chemical_chebi_up.head(1)

In [None]:
# Merge the three "up" tables and collapse duplicated (:START_ID, :END_ID)
# pairs, gathering the evidence of each pair into sets.
OBO_indirectly_positively_regulates_quantity_of_RNA = (
    pd.concat([
        lncRNA_chemical_chebi_up,
        miRNA_chemical_mirandola_up,
        miRNA_chemical_up,
    ])
    .groupby([':START_ID', ':END_ID'])
    .agg({
        'Regulator': set,
        'Interactor': set,
        'PubMedID': set,
        'Method': set,
        'Location': set,
        'Source': set,
    })
    .reset_index()
    .assign(**{':TYPE': 'indirectly_positively_regulates_quantity_of'})
)
OBO_indirectly_positively_regulates_quantity_of_RNA.to_pickle(
    unprocessed_edge_data_location + 'OBO_indirectly_positively_regulates_quantity_of_RNA.pkl')
OBO_indirectly_positively_regulates_quantity_of_RNA.head(3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011023 (indirectly negatively regulates quantity of) - RNA

* [SM2miR](http://www.jianglab.cn/SM2miR/)

In [None]:
# SM2miR records whose miRNA expression pattern is 'down-regulated'.
miRNA_chemical_down = (
    miRNA_chemical
    .loc[lambda df: df['Expression pattern of miRNA'].eq('down-regulated')]
    .drop(columns=['Expression pattern of miRNA'])
)
miRNA_chemical_down.head(3)

* [miRandola](http://mirandola.iit.cnr.it/index.php)

In [None]:
# miRandola records with expression 'down', with the endpoints exchanged
# (:START_ID <-> :END_ID).
miRNA_chemical_mirandola_down = (
    miRNA_chemical_mirandola
    .loc[lambda df: df['Expression'].eq('down')]
    .drop(columns=['Expression'])
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
)
miRNA_chemical_mirandola_down.head(3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki rows annotated as 'Down-regulated', with the endpoints exchanged
# (:START_ID <-> :END_ID).
lncRNA_chemical_chebi_down = (
    lncRNA_chemical_chebi
    .loc[lambda df: df['expression_detail'].eq('Down-regulated')]
    .drop(columns=['expression_detail'])
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
)
lncRNA_chemical_chebi_down.head(1)

In [None]:
# Merge the three "down" tables and collapse duplicated (:START_ID, :END_ID)
# pairs, gathering the evidence of each pair into sets.
OBO_indirectly_negatively_regulates_quantity_of_RNA = (
    pd.concat([
        lncRNA_chemical_chebi_down,
        miRNA_chemical_mirandola_down,
        miRNA_chemical_down,
    ])
    .groupby([':START_ID', ':END_ID'])
    .agg({
        'Regulator': set,
        'Interactor': set,
        'PubMedID': set,
        'Method': set,
        'Location': set,
        'Source': set,
    })
    .reset_index()
    .assign(**{':TYPE': 'indirectly_negatively_regulates_quantity_of'})
)
OBO_indirectly_negatively_regulates_quantity_of_RNA.to_pickle(
    unprocessed_edge_data_location + 'OBO_indirectly_negatively_regulates_quantity_of_RNA.pkl')
OBO_indirectly_negatively_regulates_quantity_of_RNA.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002513 (ribosomally translates to) - Small protein

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br /> LncBook accommodates a high-quality collection of human lncRNA genes and transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts. 

In [None]:
# Download the LncBook 2.0 small-protein table (gzipped CSV) to a fixed path.
# NOTE(review): the next cell reads it from `unprocessed_data_location`; this
# hard-coded directory is presumably the same location — confirm.
!wget https://ngdc.cncb.ac.cn/lncbook/files/sprotein_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/sprotein_LncBook2.0.csv.gz

In [None]:
# LncBook small proteins: transcript -> SmProt small protein.
lncbook_transcript_to_rnacentral = (
    rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']]
    .rename(columns={'LncBook Transcript ID': 'Transcript ID'})
    .drop_duplicates()
)
lncRNA_protein = (
    pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz')  # Small protein --> smProt IDs
    .drop(columns=['Gene ID', 'Symbol', 'SmProt Loci', 'SmProt Protein Sequence'])
    # Inner join: transcripts without an RNAcentral ID are dropped
    .merge(lncbook_transcript_to_rnacentral, on='Transcript ID')
    .drop(columns=['Transcript ID'])
    .rename(columns={'RNAcentral ID': 'RNA', 'SmProt ID': 'Small Protein'})
)

# Normalise the experimental evidence through method_map, keeping the raw label when unmapped
lncRNA_protein = (
    lncRNA_protein
    .assign(**{'Experimental Evidence': lncRNA_protein['Experimental Evidence'].str.lower()})
    .merge(method_map, right_on='0_y', left_on='Experimental Evidence', how='left')
)
lncRNA_protein = (
    lncRNA_protein
    .assign(**{'0_x': lncRNA_protein['0_x'].fillna(lncRNA_protein['Experimental Evidence'])})
    .drop(columns=['0_y', 'Experimental Evidence'])
    .rename(columns={'0_x': 'Method', 'PMID': 'PubMedID', 'regulator': 'Regulator', 'target': 'Interactor'})
    .rename(columns={'Small Protein': ':END_ID', 'RNA': ':START_ID'})
    .assign(Source='LncBook')
)
lncRNA_protein.head(3)

* [cncRNADB](https://www.rna-society.org/cncrnadb/) <br/> cncRNAdb is a manually curated database of experimentally supported cncRNAs, which aims to provide a resource for efficient manipulation, browsing and analysis of cncRNAs.

In [None]:
!wget https://www.rna-society.org/cncrnadb/download/Translated%20ncRNA.zip -O ../resources/processed_data/unprocessed_data/Translated%20ncRNA.zip

with zipfile.ZipFile(unprocessed_data_location+'Translated ncRNA.zip', 'r') as zip_ref:
    zip_ref.extractall(unprocessed_data_location)

In [None]:
# Translated ncRNA: human ncRNAs with a translated product reported by cncRNAdb
RNA_anatomy = pd.read_excel(unprocessed_data_location + 'Translated ncRNA.xlsx') # Small protein --> cncRNADB

RNA_anatomy = RNA_anatomy[RNA_anatomy.Organism.str.contains('apiens')]
RNA_anatomy = RNA_anatomy[RNA_anatomy['Gene.ID'].notna()]

# Skip entries flagged in the notes as re-annotated protein-coding genes
RNA_anatomy = RNA_anatomy[RNA_anatomy.Notes != 'It has been re-annotated as protein coding gene now']

RNA_anatomy['Source'] = 'cncRNADB'

# PubMed IDs as strings without a float suffix; unparsable or missing IDs become NaN.
# After to_numeric(...).astype(str) a missing value is spelled "nan" (it is
# "<NA>" only for nullable dtypes), so both spellings are cleared. The ".0"
# suffix is removed with an anchored, explicit regex so the result does not
# depend on the pandas-version default of `regex`.
RNA_anatomy['Pubmed.ID'] = (
    pd.to_numeric(RNA_anatomy['Pubmed.ID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace(["nan", "<NA>"], np.nan)
)

# Normalise tissues / cell lines through location_map, keeping the raw label when unmapped
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.lower().str.split(';')
RNA_anatomy = RNA_anatomy.explode('Tissue/Cell')
RNA_anatomy = pd.merge(RNA_anatomy, location_map, right_on='0_y', left_on='Tissue/Cell', how='left')
RNA_anatomy['0_x'] = RNA_anatomy['0_x'].fillna(RNA_anatomy['Tissue/Cell'])
RNA_anatomy = RNA_anatomy.drop(columns=['0_y', 'Tissue/Cell'])
RNA_anatomy = RNA_anatomy.rename(columns={'0_x':'Location','Pubmed.ID':'PubMedID'})

# Join the three method columns into one ';'-separated string, remove the
# "nan" placeholders left by missing values, then explode to one method per row
RNA_anatomy['Method'] = RNA_anatomy['In vivo/vitro assay'].astype(str) + ";" +\
    RNA_anatomy['Low-throughput method'].astype(str) + ";" + RNA_anatomy['High-throuput method'].astype(str)
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.lower()
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.replace("nan;", "", regex=False).str.replace(";nan", "", regex=False)
RNA_anatomy['Method'] = RNA_anatomy['Method'].replace("nan", np.nan)
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.split(";")
RNA_anatomy = RNA_anatomy.explode("Method")
# Normalise methods through method_map, keeping the raw label when unmapped
RNA_anatomy = pd.merge(RNA_anatomy, method_map, right_on='0_y', left_on='Method', how='left')
RNA_anatomy['0_x'] = RNA_anatomy['0_x'].fillna(RNA_anatomy['Method'])
RNA_anatomy = RNA_anatomy.drop(columns=['0_y', 'Method'])
RNA_anatomy = RNA_anatomy.rename(columns={'0_x':'Method'})

RNA_anatomy = RNA_anatomy.drop(
    columns=['Name','Chromosome','Start','End','Strand','Peptide_length','Notes','Description of experimental evidence',
             'Organism','Peptide','Human.gene.stable.ID','Chimpanzee.gene.stable.ID',
             'Mouse.gene.stable.ID','Drosophila.melanogaster.gene.stable.ID','High-throuput method',
             'Zebrafish.gene.stable.ID','In vivo/vitro assay','Low-throughput method'])

# circRNAs: Ensembl gene -> symbol -> circRNA ID (via symbol_to_circbase)
RNA_anatomy_circbase = RNA_anatomy[RNA_anatomy['Type'] == 'circRNA']
RNA_anatomy_circbase = pd.merge(RNA_anatomy_circbase, ensembl_map[['ensembl_gene_id','symbol']].drop_duplicates(),
                                left_on='Gene.ID', right_on='ensembl_gene_id')
RNA_anatomy_circbase = RNA_anatomy_circbase.merge(symbol_to_circbase.rename(columns={'gene symbol':'symbol'}), on='symbol')
# Other RNA types: attach the Ensembl transcripts whose gene and transcript type match the record
ensembl_rna_type = ensembl_map[['ensembl_gene_id','transcript_stable_id','ensembl_transcript_type']].drop_duplicates()
RNA_anatomy = pd.merge(ensembl_rna_type, RNA_anatomy, right_on=['Type','Gene.ID'],
                       left_on=['ensembl_transcript_type','ensembl_gene_id']).drop(
                           columns=["Gene.ID",'ensembl_transcript_type','ensembl_gene_id','Type'])

# Prefer RNAcentral IDs; transcripts without one keep their Ensembl transcript ID
RNA_anatomy_rnacentral = pd.merge(RNA_anatomy,
                                  rnacentral_map_human_ensembl[['RNAcentral ID',
                                                                'Ensembl transcript ID']].drop_duplicates(),
                               left_on='transcript_stable_id', right_on='Ensembl transcript ID').drop(columns=['Ensembl transcript ID'])
RNA_anatomy_ensembl = RNA_anatomy[~RNA_anatomy['transcript_stable_id'].isin(RNA_anatomy_rnacentral['transcript_stable_id'])]
RNA_anatomy_rnacentral = RNA_anatomy_rnacentral.drop(columns=['transcript_stable_id'])
print(RNA_anatomy_circbase.head(n=3))
print(RNA_anatomy_ensembl.head(n=3))
print(RNA_anatomy_rnacentral.head(n=3))
# Stack the three identifier families under a common :START_ID column
RNA_anatomy = pd.concat([RNA_anatomy_rnacentral.rename(columns={'RNAcentral ID':':START_ID'}),
                         RNA_anatomy_ensembl.rename(columns={'transcript_stable_id':':START_ID'}),
                         RNA_anatomy_circbase.rename(columns={'circRNA ID':':START_ID'})]).rename(columns={'cncRNAdb.ID':':END_ID'})
RNA_anatomy.head(n=3)

In [None]:
# Combine the cncRNAdb and LncBook tables and collapse duplicated
# (:START_ID, :END_ID) pairs, gathering the evidence of each pair into sets.
RNA_ribosomally_translates_to_smallProtein = (
    pd.concat([RNA_anatomy, lncRNA_protein])
    .groupby([':START_ID', ':END_ID'])
    .agg({'PubMedID': set, 'Method': set, 'Location': set, 'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'ribosomally_translates_to'})
)
RNA_ribosomally_translates_to_smallProtein.to_pickle(
    unprocessed_edge_data_location + 'RNA_ribosomally_translates_to_smallProtein.pkl')

# Inverse edge: endpoints exchanged and relation renamed
smallProtein_ribosomal_translation_of_RNA = (
    RNA_ribosomally_translates_to_smallProtein
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'ribosomal_translation_of'})
)
smallProtein_ribosomal_translation_of_RNA.to_pickle(
    unprocessed_edge_data_location + 'smallProtein_ribosomal_translation_of_RNA.pkl')
smallProtein_ribosomal_translation_of_RNA.head(3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0002205 (has gene product) - Small protein

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br /> LncBook accommodates a high-quality collection of human lncRNA genes and transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts. 

In [None]:
# LncBook small proteins: gene (via symbol_entrez_map) -> SmProt small protein.
lncbook_transcript_to_rnacentral = (
    rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']]
    .rename(columns={'LncBook Transcript ID': 'Transcript ID'})
    .drop_duplicates()
)
lncRNA_protein = (
    pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz')  # Small protein --> smProt IDs
    .drop(columns=['Gene ID', 'SmProt Loci', 'SmProt Protein Sequence'])
    # Inner join: only transcripts that have an RNAcentral ID are kept
    .merge(lncbook_transcript_to_rnacentral, on='Transcript ID')
    .drop(columns=['Transcript ID'])
    .rename(columns={'RNAcentral ID': 'RNA', 'SmProt ID': 'Small Protein'})
    # Rows whose symbol is the "-" placeholder are discarded
    .loc[lambda df: df['Symbol'] != "-", ['Symbol', 'Small Protein']]
    # Inner join: symbols absent from symbol_entrez_map are dropped
    .merge(symbol_entrez_map.rename(columns={0: 'Symbol'}), on='Symbol')
    .drop(columns=['Symbol'])
    .drop_duplicates()
    .rename(columns={1: ':START_ID', 'Small Protein': ':END_ID'})
    .assign(Source='LncBook')
)
lncRNA_protein.head(3)

* [cncRNADB](https://www.rna-society.org/cncrnadb/) <br/> cncRNAdb is a manually curated database of experimentally supported cncRNAs, which aims to provide a resource for efficient manipulation, browsing and analysis of cncRNAs.

In [None]:
# Translated ncRNA: cncRNAdb gene -> small protein pairs for human entries.
# NOTE(review): the Gene.ID filter reads `RNA_anatomy['Gene.ID']` from whatever
# `RNA_anatomy` currently holds in the kernel; the cell that builds RNA_anatomy
# drops 'Gene.ID' from its non-circRNA parts, so this may keep fewer genes than
# intended — confirm against the expected output.
RNA_anatomy2 = (
    pd.read_excel(unprocessed_data_location + 'Translated ncRNA.xlsx')  # Small protein --> cncRNADB
    .loc[lambda df: df['Organism'].str.contains('apiens')]
    .loc[lambda df: df['Gene.ID'].isin(RNA_anatomy['Gene.ID'])]
    # Skip entries flagged in the notes as re-annotated protein-coding genes
    .loc[lambda df: df['Notes'] != 'It has been re-annotated as protein coding gene now',
         ['Gene.ID', 'cncRNAdb.ID']]
    .drop_duplicates()
    # Inner join on the gene identifier (column 0 of ensembl_entrezGene_map)
    .merge(ensembl_entrezGene_map[[0, 1]], left_on='Gene.ID', right_on=0)
    .rename(columns={1: ':START_ID', 'cncRNAdb.ID': ':END_ID'})
    .assign(Source='cncRNADB')
)

RNA_anatomy2.head(3)

In [None]:
# Combine the cncRNAdb and LncBook gene -> small protein tables and collapse
# duplicated (:START_ID, :END_ID) pairs, gathering their sources into a set.
gene_has_gene_product_smallProtein = (
    pd.concat([RNA_anatomy2, lncRNA_protein])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'has_gene_product'})
)
gene_has_gene_product_smallProtein.to_pickle(
    unprocessed_edge_data_location + 'gene_has_gene_product_smallProtein.pkl')

# Inverse edge: endpoints exchanged and relation renamed
smallProtein_gene_product_of_gene = (
    gene_has_gene_product_smallProtein
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'gene_product_of'})
)
smallProtein_gene_product_of_gene.to_pickle(
    unprocessed_edge_data_location + 'smallProtein_gene_product_of_gene.pkl')
smallProtein_gene_product_of_gene.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002513 (ribosomally translates to) - OBO

* Ensembl

In [None]:
# Read columns 0, 1 and 4 of the Ensembl transcript / Protein Ontology map and
# keep the distinct rows whose column 4 is 'protein-coding'.
RNA_rib_to_PRO = (
    pd.read_csv(processed_data_location + 'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt',
                sep='\t', header=None)[[0, 1, 4]]
    .loc[lambda df: df[4].eq('protein-coding')]
    .drop_duplicates()
    .assign(Source='Ensembl')
    .rename(columns={0: ':START_ID', 1: ':END_ID'})
)
RNA_rib_to_PRO.head(3)

In [None]:
# Collapse duplicated (:START_ID, :END_ID) pairs and save the forward edge.
RNA_rib_to_PRO = (
    RNA_rib_to_PRO
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'ribosomally_translates_to'})
)
RNA_rib_to_PRO.to_pickle(unprocessed_edge_data_location + 'RNA_ribosomally_translates_to_OBO.pkl')

# Inverse edge: the same variable is rebound to the frame with exchanged
# endpoints and the inverse relation name.
RNA_rib_to_PRO = (
    RNA_rib_to_PRO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'ribosomal_translation_of'})
)
RNA_rib_to_PRO.to_pickle(unprocessed_edge_data_location + 'OBO_ribosomal_translation_of_RNA.pkl')
RNA_rib_to_PRO.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002513?NOT (not ribosomally translates to) - OBO

* [cncRNADB](https://www.rna-society.org/cncrnadb/) <br/> cncRNAdb is a manually curated database of experimentally supported cncRNAs, which aims to provide a resource for efficient manipulation, browsing and analysis of cncRNAs.

In [None]:
!wget -O https://www.rna-society.org/cncrnadb/download/Untranslated%20mRNA.zip -P ../resources/processed_data/unprocessed_data/

with zipfile.ZipFile(unprocessed_data_location+'Untranslated mRNA.zip', 'r') as zip_ref:
    zip_ref.extractall(unprocessed_data_location)

In [None]:
# Untranslated mRNA: the three cncRNAdb spreadsheets extracted from 'Untranslated mRNA.zip'
RNA_anatomy = pd.concat([pd.read_excel(unprocessed_data_location + 'Regulatory mRNA.xlsx'), # PRO
    pd.read_excel(unprocessed_data_location + 'Scaffold mRNA.xlsx'),
    pd.read_excel(unprocessed_data_location + 'Sponge mRNA.xlsx')])
# Keep human entries only; na=False so rows without an organism are dropped instead of
# breaking the boolean mask. .copy() avoids chained-assignment warnings below.
RNA_anatomy = RNA_anatomy[RNA_anatomy.Organism.str.contains('apiens', na=False)].copy()

RNA_anatomy['Source'] = 'cncRNADB'

# PubMed IDs are parsed as floats, so strip the trailing ".0". The pattern is anchored
# and escaped: an unanchored ".0" is a regex on pandas < 2 and would eat digits such as
# the "30" in "12305678". Missing IDs stringify to "nan", which is mapped back to NaN.
RNA_anatomy['Pubmed.ID'] = (
    pd.to_numeric(RNA_anatomy['Pubmed.ID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace({"nan": np.nan, "<NA>": np.nan})
)

# One row per tissue / cell line, grounded through location_map; unmapped labels are kept verbatim.
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.lower().str.split(';')
RNA_anatomy = RNA_anatomy.explode('Tissue/Cell')
RNA_anatomy = pd.merge(RNA_anatomy, location_map, right_on='0_y', left_on='Tissue/Cell', how='left')
RNA_anatomy['0_x'] = RNA_anatomy['0_x'].fillna(RNA_anatomy['Tissue/Cell'])
RNA_anatomy = RNA_anatomy.drop(columns=['0_y', 'Tissue/Cell'])
RNA_anatomy = RNA_anatomy.rename(columns={'0_x':'Location','Pubmed.ID':'PubMedID'})

# Merge the three method columns into one ';'-separated string, drop the "nan"
# placeholders produced by astype(str), then explode to one row per method.
# ('High-throuput method' is the column name as spelled in the source spreadsheet.)
RNA_anatomy['Method'] = RNA_anatomy['In vivo/vitro assay'].astype(str) + ";" +\
    RNA_anatomy['Low-throughput method'].astype(str) + ";" + RNA_anatomy['High-throuput method'].astype(str)
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.lower()
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.replace("nan;", "").str.replace(";nan", "")
RNA_anatomy['Method'] = RNA_anatomy['Method'].replace("nan", np.nan)
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.split(";")
RNA_anatomy = RNA_anatomy.explode("Method")
RNA_anatomy = pd.merge(RNA_anatomy, method_map, right_on='0_y', left_on='Method', how='left')
RNA_anatomy['0_x'] = RNA_anatomy['0_x'].fillna(RNA_anatomy['Method'])
RNA_anatomy = RNA_anatomy.drop(columns=['0_y', 'Method'])
RNA_anatomy = RNA_anatomy.rename(columns={'0_x':'Method'})

# Attach the identifier in column 1 of symbol_to_pro via the gene symbol (inner join).
RNA_anatomy = pd.merge(symbol_to_pro, RNA_anatomy, right_on='Name',left_on=0)

# Expand every gene to its protein-coding Ensembl transcripts.
ensembl_mrna_type = ensembl_map[['ensembl_gene_id','transcript_stable_id','ensembl_transcript_type']]
ensembl_mrna_type = ensembl_mrna_type[ensembl_mrna_type['ensembl_transcript_type'] == 'protein_coding'].drop_duplicates()

RNA_anatomy = pd.merge(ensembl_mrna_type, RNA_anatomy, right_on='Ensembl.ID',
                       left_on='ensembl_gene_id').drop(
                           columns=["Ensembl.ID",'ensembl_transcript_type','ensembl_gene_id'])

RNA_anatomy = RNA_anatomy.rename(columns={'transcript_stable_id':':START_ID', 1:':END_ID'})
RNA_anatomy.head(n=2)

In [None]:
# Collapse to one row per (transcript, target) pair, collecting provenance as sets.
RNA_anatomy = (
    RNA_anatomy
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'Method': set})
    .reset_index()
    .assign(**{':TYPE': 'not_ribosomally_translates_to'})
)
RNA_anatomy.to_pickle(unprocessed_edge_data_location + 'RNA_not_ribosomally_translates_to_OBO.pkl')

# Inverse relation: same rows with swapped endpoints.
RNA_anatomy = (
    RNA_anatomy
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_ribosomal_translation_of'})
)
RNA_anatomy.to_pickle(unprocessed_edge_data_location + 'OBO_not_ribosomal_translation_of_RNA.pkl')
RNA_anatomy.head(n=3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0002205?NOT (not has gene product) - OBO

* [cncRNADB](https://www.rna-society.org/cncrnadb/) <br/> cncRNAdb is a manually curated database of experimentally supported cncRNAs, which aims to provide a resource for efficient manipulation, browsing and analysis of cncRNAs.

In [None]:
# Same cncRNAdb spreadsheets as the transcript-level section above, rebuilt here
# with the Entrez gene ID as the edge start.
RNA_anatomy = pd.concat([pd.read_excel(unprocessed_data_location + 'Regulatory mRNA.xlsx'), # PRO
    pd.read_excel(unprocessed_data_location + 'Scaffold mRNA.xlsx'),
    pd.read_excel(unprocessed_data_location + 'Sponge mRNA.xlsx')])
# Keep human entries only; na=False so rows without an organism are dropped instead of
# breaking the boolean mask. .copy() avoids chained-assignment warnings below.
RNA_anatomy = RNA_anatomy[RNA_anatomy.Organism.str.contains('apiens', na=False)].copy()

RNA_anatomy['Source'] = 'cncRNADB'

# PubMed IDs are parsed as floats, so strip the trailing ".0". The pattern is anchored
# and escaped: an unanchored ".0" is a regex on pandas < 2 and would eat digits.
# Missing IDs stringify to "nan", which is mapped back to NaN.
RNA_anatomy['Pubmed.ID'] = (
    pd.to_numeric(RNA_anatomy['Pubmed.ID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace({"nan": np.nan, "<NA>": np.nan})
)

# One row per tissue / cell line, grounded through location_map; unmapped labels are kept verbatim.
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.lower().str.split(';')
RNA_anatomy = RNA_anatomy.explode('Tissue/Cell')
RNA_anatomy = pd.merge(RNA_anatomy, location_map, right_on='0_y', left_on='Tissue/Cell', how='left')
RNA_anatomy['0_x'] = RNA_anatomy['0_x'].fillna(RNA_anatomy['Tissue/Cell'])
RNA_anatomy = RNA_anatomy.drop(columns=['0_y', 'Tissue/Cell'])
RNA_anatomy = RNA_anatomy.rename(columns={'0_x':'Location','Pubmed.ID':'PubMedID'})

# Merge the three method columns into one ';'-separated string, drop the "nan"
# placeholders produced by astype(str), then explode to one row per method.
# ('High-throuput method' is the column name as spelled in the source spreadsheet.)
RNA_anatomy['Method'] = RNA_anatomy['In vivo/vitro assay'].astype(str) + ";" +\
    RNA_anatomy['Low-throughput method'].astype(str) + ";" + RNA_anatomy['High-throuput method'].astype(str)
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.lower()
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.replace("nan;", "").str.replace(";nan", "")
RNA_anatomy['Method'] = RNA_anatomy['Method'].replace("nan", np.nan)
RNA_anatomy['Method'] = RNA_anatomy['Method'].str.split(";")
RNA_anatomy = RNA_anatomy.explode("Method")
RNA_anatomy = pd.merge(RNA_anatomy, method_map, right_on='0_y', left_on='Method', how='left')
RNA_anatomy['0_x'] = RNA_anatomy['0_x'].fillna(RNA_anatomy['Method'])
RNA_anatomy = RNA_anatomy.drop(columns=['0_y', 'Method'])
RNA_anatomy = RNA_anatomy.rename(columns={'0_x':'Method'})

# Attach the identifier in column 1 of symbol_to_pro via the gene symbol (inner join).
RNA_anatomy = pd.merge(symbol_to_pro, RNA_anatomy, right_on='Name',left_on=0)

# Inner join on protein-coding Ensembl genes; this only restricts the rows here, since the
# transcript ID is not used as an endpoint (duplicates collapse in the groupby that follows).
ensembl_mrna_type = ensembl_map[['ensembl_gene_id','transcript_stable_id','ensembl_transcript_type']]
ensembl_mrna_type = ensembl_mrna_type[ensembl_mrna_type['ensembl_transcript_type'] == 'protein_coding'].drop_duplicates()

RNA_anatomy = pd.merge(ensembl_mrna_type, RNA_anatomy, right_on='Ensembl.ID',
                       left_on='ensembl_gene_id').drop(
                           columns=["Ensembl.ID",'ensembl_transcript_type','ensembl_gene_id'])
RNA_anatomy = RNA_anatomy.rename(columns={'Entrez.ID':':START_ID', 1:':END_ID'})
# Entrez IDs may be read as floats ("1234.0"). Strip only a trailing ".0": the unanchored
# ".0" pattern is a regex on pandas < 2 and would turn "100.0" into "1".
RNA_anatomy[':START_ID'] = RNA_anatomy[':START_ID'].astype(str).str.replace(r"\.0$", "", regex=True).astype(int)
RNA_anatomy.head(n=3)

In [None]:
# Collapse to one row per (gene, target) pair, collecting provenance as sets.
RNA_anatomy = (
    RNA_anatomy
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'Method': set})
    .reset_index()
    .assign(**{':TYPE': 'not_has_gene_product'})
)
RNA_anatomy.to_pickle(unprocessed_edge_data_location + 'gene_not_has_gene_product_OBO.pkl')

# Inverse relation: same rows with swapped endpoints.
RNA_anatomy = (
    RNA_anatomy
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_gene_product_of'})
)
RNA_anatomy.to_pickle(unprocessed_edge_data_location + 'OBO_not_gene_product_of_gene.pkl')
RNA_anatomy.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002206?NOT (not expressed in) - OBO

* LncBook

In [None]:
lncRNA_expression = pd.read_csv(unprocessed_data_location + 'expression_LncBook2.0.csv.gz') # Biological context (Uberon+GO+CLO+Mondo+GO)
# Rename every LncBook context column to the ontology term used as the edge target.
lncRNA_expression = lncRNA_expression.drop(columns=['Symbol','Featured Expression']).rename(
    columns={'Normal Tissue/Cell Line': 'UBERON_0000479',
             'Organ Development':'GO_0048513',
             'Preimplantation Embryo':'GO_0007566',
             'Cell Differentiation':'GO_0030154',
             'Subcellular Localization':'GO_0051179',
             'Exosome':'GO_0070062',
             'Cancer Cell Line':'CLO_0009828',
             'Virus Infection':'MONDO_0005108',
             'Circadian Rhythm':'GO_0007623'})

# Replace the LncBook gene ID with the RNAcentral ID (inner join; unmapped genes are dropped).
lncRNA_expression = pd.merge(lncRNA_expression, rnacentral_map_human_lncbook[['RNAcentral ID', 'LncBook Gene ID']].drop_duplicates().rename(
    columns={'LncBook Gene ID':'Gene ID'}), on = 'Gene ID').drop(columns=['Gene ID']).rename(columns={'RNAcentral ID':'RNA'})

lncbook_contexts = ['UBERON_0000479','GO_0048513','GO_0007566','GO_0030154','GO_0051179',
                    'GO_0070062','CLO_0009828','MONDO_0005108','GO_0007623']


def _lncbook_edges_for_level(level):
    """Return RNA -> context edges for every context column whose value equals `level`.

    `level` is one of the codes stored in the context columns ('HC', 'MC', 'LC', 'NE').
    The result has columns ':START_ID' (RNAcentral ID), ':END_ID' (context term) and
    'Source'. Rows are ordered context by context and keep their original row labels.
    """
    # Build all per-context slices first and concatenate once, instead of growing a
    # frame inside the loop and writing into filtered slices (SettingWithCopyWarning).
    per_context = [
        lncRNA_expression.loc[lncRNA_expression[context] == level, ['RNA']]
        .assign(**{':END_ID': context})
        for context in lncbook_contexts
    ]
    edges = pd.concat(per_context).rename(columns={'RNA': ':START_ID'})
    edges['Source'] = 'LncBook'
    return edges


HCfinal = _lncbook_edges_for_level('HC')
LCfinal = _lncbook_edges_for_level('LC')
MCfinal = _lncbook_edges_for_level('MC')
NEfinal = _lncbook_edges_for_level('NE')

# NOTE: a previously disabled step counted duplicated rows per table
# ('Number_of_experiments'); it was dead code and is not reproduced here.

NEfinal.head(n=3)

In [None]:
# 'NE' rows: one edge per (RNA, context) pair.
NEfinal = (
    NEfinal
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'not_expressed_in'})
)
NEfinal.to_pickle(unprocessed_edge_data_location + 'RNA_not_expressed_in_OBO.pkl')

# Inverse relation with swapped endpoints.
NEfinal = (
    NEfinal
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_expresses'})
)
NEfinal.to_pickle(unprocessed_edge_data_location + 'OBO_not_expresses_RNA.pkl')
NEfinal.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002291 (ubiquitously expressed in) - OBO

* LncBook

In [None]:
# 'MC' rows: one edge per (RNA, context) pair.
MCfinal = (
    MCfinal
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'ubiquitously_expressed_in'})
)
MCfinal.to_pickle(unprocessed_edge_data_location + 'RNA_ubiquitously_expressed_in_OBO.pkl')

# Inverse relation with swapped endpoints.
MCfinal = (
    MCfinal
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'ubiquitously_expresses'})
)
MCfinal.to_pickle(unprocessed_edge_data_location + 'OBO_ubiquitously_expresses_RNA.pkl')
MCfinal.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002260 (has biological role) - Biological role

* [dbEssLnc](https://esslnc.pufengdu.org/home) <br /> dbEssLnc contains lncRNA annotations; data are constantly added by manual screening. 

In [None]:
!wget https://esslnc.pufengdu.org/data/essential%20lncRNA.json --no-check-certificate -O ../resources/processed_data/unprocessed_data/essential%20lncRNA.json

In [None]:
dbEssLnc = pd.read_json(unprocessed_data_location + 'essential lncRNA.json') # NIH Genetics Glossary
# Human entries only; drop() returns a new frame, so nothing is mutated on a slice of dbEssLnc.
lncRNA_role = dbEssLnc[dbEssLnc['Organism']=='Human'].drop(
    columns=['ID','Name','Reason','Aliases','fId','Organism','NCBI_gene_Id','Gene_Ontology_Annotations'])
print(lncRNA_role.Role.unique())
# For grounding purposes
lncRNA_role = lncRNA_role.replace('Tumor suppressor gene', 'Tumor-Suppressor-Gene')
# Missing values become the 'nan' placeholder string in every column.
lncRNA_role = lncRNA_role.fillna('nan')

# Replace the NONCODE gene ID with the RNAcentral ID (inner join; unmapped entries are dropped).
lncRNA_role = pd.merge(lncRNA_role, rnacentral_map_human_noncode[['RNAcentral ID', 'NONCODE Gene ID']].drop_duplicates().rename(
    columns={'NONCODE Gene ID':'NONCODEId'}), on = 'NONCODEId').drop(columns=['NONCODEId']).rename(columns={'RNAcentral ID':'RNA'})

# PMIDs are parsed as floats, so strip the trailing ".0". The pattern is anchored and
# escaped: an unanchored ".0" is a regex on pandas < 2 and would eat digits. Missing
# PMIDs stringify to "nan" (not "<NA>") and are mapped to NaN, so the literal 'nan'
# does not end up in the aggregated PubMedID sets.
lncRNA_role['PMID'] = (
    pd.to_numeric(lncRNA_role['PMID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace({"nan": np.nan, "<NA>": np.nan})
)

lncRNA_role['Source'] = 'dbEssLnc'
lncRNA_role = lncRNA_role.rename(columns={'RNA':':START_ID','Role':':END_ID', 'PMID':'PubMedID'})
lncRNA_role.head(n=3)

In [None]:
# One edge per (RNA, role) pair with the sets of sources and PubMed IDs that support it.
lncRNA_role = (
    lncRNA_role
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'has_biological_role'})
)
lncRNA_role.to_pickle(unprocessed_edge_data_location + 'RNA_has_biological_role_biologicalRole.pkl')
lncRNA_role.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0009501 (realized in response to) - Reactome

* Reactome

In [None]:
# Fetch the Reactome GO association file into the unprocessed data folder.
reactome_gaf_url = "https://reactome.org/download/current/gene_association.reactome.gz"
data_downloader(reactome_gaf_url, unprocessed_data_location)

In [None]:
# Human annotations with aspect 'P' (column 8) that reference a Reactome entry (column 5).
# Column 4 (GO ID, ':' -> '_') is the edge start; column 5 without its 'REACTOME:' prefix is the end.
go_reactome = (
    pd.read_csv(unprocessed_data_location + 'gene_association.reactome', sep='\t', comment="!", header=None)
    .loc[lambda gaf: (gaf[8] == 'P') & (gaf[12] == "taxon:9606") & (gaf[5].str.startswith('REACTOME')), [4, 5]]
    .rename(columns={4: ':START_ID', 5: ':END_ID'})
    .assign(**{
        ':START_ID': lambda edges: edges[':START_ID'].str.replace(':', '_'),
        ':END_ID': lambda edges: edges[':END_ID'].str.replace('REACTOME:', ''),
        'Source': 'Reactome',
    })
)
go_reactome.head(n=3)

In [None]:
# One edge per (GO term, Reactome entry) pair.
go_reactome = (
    go_reactome
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'realized_in_response_to'})
)
go_reactome.to_pickle(unprocessed_edge_data_location + 'OBO_realized_in_response_to_reactome.pkl')
go_reactome.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002180 (has component) - Reactome

* Reactome

In [None]:
# Human annotations with aspect 'C' (column 8) that reference a Reactome entry (column 5).
# Column 4 (GO ID, ':' -> '_') is the edge start; column 5 without its 'REACTOME:' prefix is the end.
go_reactome = (
    pd.read_csv(unprocessed_data_location + 'gene_association.reactome', sep='\t', comment="!", header=None)
    .loc[lambda gaf: (gaf[8] == 'C') & (gaf[12] == "taxon:9606") & (gaf[5].str.startswith('REACTOME')), [4, 5]]
    .rename(columns={4: ':START_ID', 5: ':END_ID'})
    .assign(**{
        ':START_ID': lambda edges: edges[':START_ID'].str.replace(':', '_'),
        ':END_ID': lambda edges: edges[':END_ID'].str.replace('REACTOME:', ''),
        'Source': 'Reactome',
    })
)
go_reactome.head(n=3)

In [None]:
# One edge per (GO term, Reactome entry) pair.
go_reactome = (
    go_reactome
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'has_component'})
)
go_reactome.to_pickle(unprocessed_edge_data_location + 'OBO_has_component_reactome.pkl')
go_reactome.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0000085 (has function) - Reactome

* Reactome

In [None]:
# Human annotations with aspect 'F' (column 8) that reference a Reactome entry (column 5).
# Column 4 (GO ID, ':' -> '_') is the edge start; column 5 without its 'REACTOME:' prefix is the end.
go_reactome = (
    pd.read_csv(unprocessed_data_location + 'gene_association.reactome', sep='\t', comment="!", header=None)
    .loc[lambda gaf: (gaf[8] == 'F') & (gaf[12] == "taxon:9606") & (gaf[5].str.startswith('REACTOME')), [4, 5]]
    .rename(columns={4: ':START_ID', 5: ':END_ID'})
    .assign(**{
        ':START_ID': lambda edges: edges[':START_ID'].str.replace(':', '_'),
        ':END_ID': lambda edges: edges[':END_ID'].str.replace('REACTOME:', ''),
        'Source': 'Reactome',
    })
)
go_reactome.head(n=3)

In [None]:
# One edge per (GO term, Reactome entry) pair.
go_reactome = (
    go_reactome
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'has_function'})
)
go_reactome.to_pickle(unprocessed_edge_data_location + 'OBO_has_function_reactome.pkl')

# Inverse relation with swapped endpoints.
go_reactome = (
    go_reactome
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'function_of'})
)
go_reactome.to_pickle(unprocessed_edge_data_location + 'reactome_function_of_OBO.pkl')
go_reactome.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0000056 (participates in) - OBO

* [Rfam](http://rfamlive.xfam.org/) <br /> The Rfam database is a collection of RNA families, each represented by multiple sequence alignments, consensus secondary structures and covariance models.

In [None]:
# Download the RNAcentral Rfam GO annotation table (gzipped TSV) into the unprocessed data folder.
!wget https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/go_annotations/rnacentral_rfam_annotations.tsv.gz -O ../resources/processed_data/unprocessed_data/rnacentral_rfam_annotations.tsv.gz

In [None]:
# Three-column TSV without header: RNAcentral ID (suffixed with the taxon), GO term, Rfam ID.
rfam_go = pd.read_csv(
    unprocessed_data_location + 'rnacentral_rfam_annotations.tsv.gz',
    sep='\t',
    names=['RNAcentral ID', "GO", "Rfam ID"],
).drop(columns=['Rfam ID'])

# Keep the entries whose ID ends in '_9606', then strip that suffix.
human_entries = rfam_go['RNAcentral ID'].str.endswith('_9606')
rfam_go = rfam_go[human_entries]
rfam_go['RNAcentral ID'] = rfam_go['RNAcentral ID'].str.split('_').str[0]
rfam_go['GO'] = rfam_go['GO'].str.replace(':','_')

# Both databases are credited: the source string is split and exploded to one row per source.
rfam_go['Source'] = 'Rfam, RNAcentral'
rfam_go['Source'] = rfam_go['Source'].str.split(", ")
rfam_go = rfam_go.explode('Source')
rfam_go.head(n=3)

In [None]:
goterms_in_rfam = rfam_go['GO'].unique()

# Look up the aspect of every GO term through the QuickGO REST API (one request per term).
# The accumulator used to be called `dict`, which shadowed the builtin for the rest of
# the kernel session; it now has a descriptive name.
go_aspects = []
for term in goterms_in_rfam:
    term_record = pd.read_json("https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/" + term.replace("_",":") + "/complete")['results'][0]
    go_aspects.append(term_record.get("aspect"))
goterms_in_rfam_map_relation = pd.DataFrame({'GO':goterms_in_rfam, 'Aspect':go_aspects})

rfam_go = pd.merge(rfam_go, goterms_in_rfam_map_relation, on='GO')
rfam_go = rfam_go.rename(columns={'RNAcentral ID':':START_ID','GO':':END_ID'})
# Split the annotations by GO aspect; each aspect is handled as a separate relation.
rfam_gobp = rfam_go[rfam_go['Aspect'] =='biological_process'].drop(columns=['Aspect'])
rfam_gomf = rfam_go[rfam_go['Aspect'] =='molecular_function'].drop(columns=['Aspect'])
rfam_gocc = rfam_go[rfam_go['Aspect'] =='cellular_component'].drop(columns=['Aspect'])

rfam_gobp.head(n=3)

* [Ribocentre](https://www.ribocentre.org/) <br />
Ribocentre is designed to contain comprehensive information of all natural ribozymes.

In [None]:
# https://www.ribocentre.org/application/ --> Gene Expression system --> CSV button
ribozyme_go = pd.read_csv(unprocessed_data_location + 'Ribocentre - Application.csv')
print(ribozyme_go['ribozyme name'].unique())

# Ground ribozyme names to Sequence Ontology terms (literal substring replacement, applied in order).
ribozyme_name_to_so = {
    'glmS ribozyme': 'SO_0000374',
    'hammerhead ribozyme': 'SO_0000380',
    'LC ribozyme': 'SO_0000374',
    'pistol ribozyme': 'SO_0000374',
    'RNase P': 'SO_0000386',
    'twister ribozyme': 'SO_0000374',
    'VS ribozyme': 'SO_0000374',
}
for ribozyme_name, so_term in ribozyme_name_to_so.items():
    ribozyme_go['ribozyme name'] = ribozyme_go['ribozyme name'].str.replace(ribozyme_name, so_term, regex=False)

# .copy() so the GO column is added to an independent frame rather than to a column slice.
ribozyme_go = ribozyme_go[['ribozyme name', 'pubmed ID']].copy()
# Manually curated GO term per CSV row, aligned by position ('nan' = no annotation).
# The assignment raises if the CSV no longer has exactly this many rows.
ribozyme_go['GO'] = ['nan','nan','GO_0015867', 'GO_0032363', 'GO_0010468', 'GO_0010468', 'GO_0010468', 'GO_2000232',
                         'GO_0010468', 'GO_0010468', 'GO_0003743', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'GO_0010468',
                         'nan', 'nan', 'nan', 'GO_0050790', 'nan', 'nan', 'nan', 'nan', 'nan', 'GO_0050790', 'nan', 'nan', 'nan', 'nan', 'nan']
ribozyme_go = ribozyme_go[ribozyme_go['GO'] != 'nan']

# Look up the aspect of every GO term through the QuickGO REST API (one request per term).
# The accumulator used to be called `dict`, which shadowed the builtin.
goterms_in_ribocentre = ribozyme_go['GO'].unique()
go_aspects = []
for term in goterms_in_ribocentre:
    term_record = pd.read_json("https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/" + term.replace("_",":") + "/complete")['results'][0]
    go_aspects.append(term_record.get("aspect"))
goterms_in_ribocentre_map_relation = pd.DataFrame({'GO':goterms_in_ribocentre, 'Aspect':go_aspects})

ribozyme_go = pd.merge(ribozyme_go, goterms_in_ribocentre_map_relation, on='GO')
# HDV ribozyme is from delta virus, taxid = 12475

# The HDV ribozyme is replaced by two RNAcentral IDs, one row each.
ribozyme_go['ribozyme name'] = ribozyme_go['ribozyme name'].str.replace("HDV ribozyme", "URS00006C745E, URS00006C1D09")
ribozyme_go['ribozyme name'] = ribozyme_go['ribozyme name'].str.split(", ")
ribozyme_go = ribozyme_go.explode('ribozyme name')
ribozyme_go = ribozyme_go.rename(columns={'ribozyme name':'RNA'})

ribozyme_go['Source'] = 'Ribocentre'

# PubMed IDs are parsed as floats, so strip the trailing ".0". The pattern is anchored
# and escaped: an unanchored ".0" is a regex on pandas < 2 and would eat digits.
ribozyme_go['pubmed ID'] = (
    pd.to_numeric(ribozyme_go['pubmed ID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace("nan", np.nan)
)

# Split into SO-grounded rows (by GO aspect) and RNAcentral-grounded rows.
ribozyme_go_so = ribozyme_go[ribozyme_go['RNA'].str.startswith('SO')][['RNA','GO','pubmed ID','Aspect','Source']]
ribozyme_go_so_mf = ribozyme_go_so[ribozyme_go_so['Aspect'] == 'molecular_function'].drop(columns=['Aspect'])
ribozyme_go_so_bp = ribozyme_go_so[ribozyme_go_so['Aspect'] == 'biological_process'].drop(columns=['Aspect'])
ribozyme_go_rnacentral = ribozyme_go[~ribozyme_go['RNA'].str.startswith('SO')][['RNA','GO','pubmed ID','Aspect','Source']]
print(ribozyme_go_rnacentral['Aspect'].unique())
ribozyme_go_rnacentral = ribozyme_go_rnacentral.drop(columns=['Aspect']).rename(
    columns={'RNA':':START_ID','GO':':END_ID','pubmed ID':'PubMedID'})
ribozyme_go_rnacentral.head()

* [TBDB](https://tbdb.io/) <br /> 

In [None]:
# https://tbdb.io/database/ --> CSV button
riboswitch_gobp = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',') # riboswitch -- GO

# The GO term is the text between the last '[' and the last ']' of protein_desc
after_last_open_bracket = riboswitch_gobp.protein_desc.str.rpartition('[')[2]
inside_brackets = after_last_open_bracket.str.rpartition(']')[0]
gobp = inside_brackets.str.replace(":", "_")

riboswitch_gobp = pd.concat([riboswitch_gobp, gobp.rename('gobp')], axis=1)
# Rows whose bracketed text does not contain "GO" are discarded
has_go_term = riboswitch_gobp.gobp.str.contains("GO", na=False)
riboswitch_gobp = (
    riboswitch_gobp.loc[has_go_term, ['accession_url', 'gobp']]
    .rename(columns={'accession_url':':START_ID','gobp':':END_ID'})
)
riboswitch_gobp['Source'] = 'TBDB'
riboswitch_gobp.head(n=3)

* [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/) <br/> ncRDeathDB includes ncRNA types associated with apoptosis, autophagy, and necrosis.

In [None]:
# Download the complete ncRDeathDB spreadsheet into the unprocessed data folder.
!wget https://www.rna-society.org/ncrdeathdb/data/allNcRNACelldeathData.xlsx -O ../resources/processed_data/unprocessed_data/allNcRNACelldeathData.xlsx

In [None]:
RNA_pDeath = pd.read_excel(unprocessed_data_location + 'allNcRNACelldeathData.xlsx', dtype={"geneid": "string"}) # GO
print(RNA_pDeath['RNA Category'].unique())
print(RNA_pDeath['Action_Mode'].unique())
print(RNA_pDeath['Pathway'].unique())
# Ground the three cell-death pathways to GO terms.
RNA_pDeath['Pathway'] = RNA_pDeath['Pathway'].replace({'necrosis': 'GO_0097300', 'autophagy': 'GO_0006914',
                                                       'apoptosis': 'GO_0006915'})
# Normalise the Action_Mode spellings found in the sheet, then create one row per mode.
RNA_pDeath['Action_Mode'] = RNA_pDeath['Action_Mode'].str.replace('updown', 'up,down')
RNA_pDeath['Action_Mode'] = RNA_pDeath['Action_Mode'].str.replace('dowm', 'down')
RNA_pDeath['Action_Mode'] = RNA_pDeath['Action_Mode'].str.replace('up ed', 'up')
RNA_pDeath['Action_Mode'] = RNA_pDeath['Action_Mode'].str.split(",")
RNA_pDeath = RNA_pDeath.explode('Action_Mode')
# One row per miRBase ID as well.
RNA_pDeath['miRBase_ID'] = RNA_pDeath['miRBase_ID'].str.split(",")
RNA_pDeath = RNA_pDeath.explode('miRBase_ID')
RNA_pDeath['RNA Category'] = RNA_pDeath['RNA Category'].str.strip()
RNA_pDeath['miRNA_symbol'] = RNA_pDeath['miRNA_symbol'].str.strip() # This (mislabeled) column is used for identifying lncRNAs and snoRNAs
RNA_pDeath['miRNA_symbol'] = RNA_pDeath['miRNA_symbol'].str.upper()
RNA_pDeath['miRBase_ID'] = RNA_pDeath['miRBase_ID'].str.strip()
RNA_pDeath['Gene_Symbol'] = RNA_pDeath['Gene_Symbol'].str.strip()
RNA_pDeath['Tissue'] = RNA_pDeath['Tissue'].str.strip()
# Human only. TODO: add the other species.
# drop() returns a new frame, so nothing is mutated in place on the filtered slice.
RNA_pDeath = RNA_pDeath[RNA_pDeath.tax_id == 9606].drop(
    columns=['Description','Description.1','tax_id','Organism','id','miRBase_mature_ID',
             'geneid','Synonyms','Links','chromosome','map_location','type_of_gene','Full_name_from_nomenclature_authority',
             'Other_designations'])

# PMIDs are parsed as floats, so strip the trailing ".0". The pattern is anchored and
# escaped: an unanchored ".0" is a regex on pandas < 2 and would eat digits.
RNA_pDeath['PMID'] = (
    pd.to_numeric(RNA_pDeath['PMID'], errors='coerce')
    .astype(str)
    .str.replace(r"\.0$", "", regex=True)
    .replace("nan", np.nan)
)

# Ground tissues through location_map; unmapped labels are kept verbatim, empty strings become NaN.
RNA_pDeath['Tissue'] = RNA_pDeath['Tissue'].str.lower()
RNA_pDeath = pd.merge(RNA_pDeath, location_map, right_on='0_y', left_on='Tissue', how='left')
RNA_pDeath['0_x'] = RNA_pDeath['0_x'].fillna(RNA_pDeath['Tissue'])
RNA_pDeath['0_x'] = RNA_pDeath['0_x'].replace('',np.nan)
RNA_pDeath = RNA_pDeath.drop(columns=['0_y', 'Tissue'])
RNA_pDeath = RNA_pDeath.rename(columns={'0_x':'Location', 'PMID':'PubMedID'})

RNA_pDeath = RNA_pDeath.rename(columns={'Gene_Symbol':'Interactor'})

RNA_pDeath.head(n=3)

In [None]:
is_mirna = RNA_pDeath['RNA Category'] == 'miRNA'
miRNA_pDeath = RNA_pDeath[is_mirna]

# These miRBase entries do not exist in the RNAcentral mapping (first five shown)
known_mirbase = miRNA_pDeath['miRBase_ID'].isin(rnacentral_map_human_mirbase['miRBase ID'])
print(miRNA_pDeath[~known_mirbase]['miRBase_ID'].unique()[:5])

# Inner join on the miRBase ID, then discard the helper columns
miRNA_pDeath = miRNA_pDeath.merge(
    rnacentral_map_human_mirbase,
    left_on='miRBase_ID',
    right_on='miRBase ID',
).drop(columns=['miRBase_ID', 'miRBase ID', 'RNA Category', 'miRNA_symbol'])
miRNA_pDeath.head(n=3)

In [None]:
# Everything that is not a miRNA is mapped to RNAcentral through its HGNC symbol
# (the symbol is stored in the 'miRNA_symbol' column).
hgnc_to_rnacentral = rnacentral_map_human_hgnc[['RNAcentral ID', 'HGNC symbol']].drop_duplicates()
RNA_pDeath = (
    RNA_pDeath[RNA_pDeath['RNA Category'] != 'miRNA']
    .merge(hgnc_to_rnacentral, left_on='miRNA_symbol', right_on='HGNC symbol')
    .drop(columns=['RNA Category', 'miRBase_ID', 'HGNC symbol', 'miRNA_symbol'])
)
RNA_pDeath.head(n=3)

In [None]:
# Stack the non-miRNA and miRNA tables; a missing Action_Mode becomes the 'nan' placeholder string.
RNA_pDeath = (
    pd.concat([RNA_pDeath, miRNA_pDeath])
    .fillna(value={'Action_Mode': 'nan'})
    .rename(columns={'Pathway': ':END_ID', 'RNAcentral ID': ':START_ID'})
)
RNA_pDeath['Source'] = 'ncRDeathDB'

In [None]:
# Rows without an action mode (placeholder 'nan') are kept for the generic "participates in" relation.
RNA_pDeath_participates = (
    RNA_pDeath
    .loc[lambda edges: edges['Action_Mode'].eq('nan')]
    .drop(columns=['Action_Mode'])
)
RNA_pDeath_participates.head(n=3)

* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
# Uncomment on first use to extract the miRPathDB archive:
#tar = tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz')
#tar.extractall(unprocessed_data_location)
#tar.close()

with open(unprocessed_data_location+'hsa/GO_BP_validated_miRTarBase_strong.gmt', 'r') as file: # GO
    data = file.read().rstrip()

# One row per GMT line; rows have different lengths, so short rows are padded with ''.
miRNA_GO = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

miRNA_GO[0] = miRNA_GO[0].str.lower()
miRNA_GO = miRNA_GO.dropna(axis=1, how='all')
miRNA_GO = miRNA_GO.drop(columns=[1])
# After this merge column 0 is the lower-cased set name, column 1 comes from desc_go_map
# and every remaining column holds one set member (or '').
miRNA_GO = pd.merge(desc_go_map, miRNA_GO, left_on=[0], right_on=[0])
miRNA_GO = miRNA_GO.dropna(axis=1, how='all')

# Reshape wide -> long in a single pass: (member, column 0, column 1) per cell.
# This replaces a loop that transposed the whole frame three times per row and
# grew the result with repeated pd.concat calls.
member_columns = [column for column in miRNA_GO.columns if column not in (0, 1)]
member_records = [
    (member, set_name, mapped_term)
    for set_name, mapped_term, *members in miRNA_GO[[0, 1] + member_columns].itertuples(index=False, name=None)
    for member in members
]
miRNA_GO = pd.DataFrame(member_records, columns=[0, 1, 2]).dropna()
miRNA_GO = miRNA_GO[miRNA_GO[0] != '']

# Sanity check: True when every member name is found in the RNAcentral descriptions.
print(all(miRNA_GO[0].isin(rnacentral_map_human['DB Description'])))
miRNA_GO = pd.merge(miRNA_GO, rnacentral_map_human.rename(columns={'DB Description':0}), on=0).drop(
    columns=[0,1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})

miRNA_GO['Source'] = 'miRPathDB'
miRNA_GO = miRNA_GO.rename(columns={'RNA':':START_ID',2:':END_ID'})
miRNA_GO.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki: lncRNA transcript -> pathway annotations.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv') # PW
# .copy() so the column assignments below act on an independent frame rather than on a
# slice of LncRNAWiki (avoids SettingWithCopyWarning).
lncRNA_pw = LncRNAWiki[LncRNAWiki['pathway'].notna()].copy()
lncRNA_pw['pathway'] = lncRNA_pw['pathway'].str.lower()
lncRNA_pw = lncRNA_pw.drop(columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','clinical_detail',
                                        'modification_detail','target_interaction', 'conservation_species','target_effect',
                                        'description','conservation','target_type','biological_context','regulator_effect',
                                        'regulator_interaction', 'genome_variation', 'variation_detail', 'molecular_function',
                                        'expression','regulator_type','functional_mechanism'])

# Both fields are comma-separated lists: explode to one row per (transcript, pathway).
lncRNA_pw['transcript_id'] = lncRNA_pw['transcript_id'].str.split(',')
lncRNA_pw = lncRNA_pw.explode('transcript_id')
lncRNA_pw = lncRNA_pw[lncRNA_pw['transcript_id'].notna()]

lncRNA_pw['pathway'] = lncRNA_pw['pathway'].str.split(',')
lncRNA_pw = lncRNA_pw.explode('pathway')
lncRNA_pw = lncRNA_pw[lncRNA_pw['pathway'].notna()]

# Pathway description -> pathway ID (inner merge: unmapped descriptions are dropped).
lncRNA_pw = pd.merge(desc_pw_map.rename(columns={0:'pathway'}), lncRNA_pw, on=['pathway']).drop(
    columns=['pathway']).rename(columns={1:'Pathway'})

# LncBook transcript ID -> RNAcentral ID.
lncRNA_pw = pd.merge(lncRNA_pw, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})

# Normalise PMIDs to digit strings. Only the trailing ".0" left by the float round-trip is
# removed: the previous pattern ".0" is a regex under pandas < 2.0 ("any character followed
# by 0") and silently deleted digits from PMIDs such as 30456789.
lncRNA_pw['pmid'] = pd.to_numeric(lncRNA_pw['pmid'], errors='coerce')
lncRNA_pw['pmid'] = lncRNA_pw['pmid'].astype(str)
lncRNA_pw['pmid'] = lncRNA_pw['pmid'].str.replace(r"\.0$", "", regex=True)
lncRNA_pw['pmid'] = lncRNA_pw['pmid'].replace("nan", np.nan)

lncRNA_pw['Source'] = 'LncRNAWiki'
lncRNA_pw = lncRNA_pw.rename(columns={'RNA':':START_ID','Pathway':':END_ID', 'pmid':'PubMedID'})
lncRNA_pw.head(n=3)

In [None]:
# Keep the pathway annotations that carry no expression detail and drop that column.
lncRNA_pw_participates = lncRNA_pw.loc[lncRNA_pw['expression_detail'].isna()].drop(
    columns='expression_detail') # PW
lncRNA_pw_participates.head(n=3)

* [LncRNAWiki](https://ngdc.cncb.ac.cn/lncrnawiki/)

In [None]:
# LncRNAWiki: lncRNA transcript -> GO annotations taken from 'biological_context'.
LncRNAWiki = pd.read_csv(unprocessed_data_location+'LncRNAWiki_BrowseDownload.csv') # GO
# .copy() so the column assignments below act on an independent frame rather than on a
# slice of LncRNAWiki (avoids SettingWithCopyWarning).
lncRNA_gobp2 = LncRNAWiki[LncRNAWiki['biological_context'].notna()].copy()
lncRNA_gobp2['biological_context'] = lncRNA_gobp2['biological_context'].str.lower()
lncRNA_gobp2 = lncRNA_gobp2.drop(columns=['symbol','synonyms','gene_locus','gene_id','conservation_ortholog','clinical_detail',
                                        'modification_detail','target_interaction', 'conservation_species','target_effect',
                                        'description','conservation','target_type','regulator_effect',
                                        'regulator_interaction', 'genome_variation', 'variation_detail', 'molecular_function',
                                        'expression','regulator_type','functional_mechanism'])

# Both fields are comma-separated lists: explode to one row per (transcript, context).
lncRNA_gobp2['transcript_id'] = lncRNA_gobp2['transcript_id'].str.split(',')
lncRNA_gobp2 = lncRNA_gobp2.explode('transcript_id')
lncRNA_gobp2 = lncRNA_gobp2[lncRNA_gobp2['transcript_id'].notna()]

lncRNA_gobp2['biological_context'] = lncRNA_gobp2['biological_context'].str.split(',')
lncRNA_gobp2 = lncRNA_gobp2.explode('biological_context')
lncRNA_gobp2 = lncRNA_gobp2[lncRNA_gobp2['biological_context'].notna()]

# Context description -> GO ID (inner merge: unmapped descriptions are dropped).
lncRNA_gobp2 = pd.merge(desc_go_map.rename(columns={0:'biological_context'}), lncRNA_gobp2, on=['biological_context']).drop(
    columns=['biological_context']).rename(columns={1:'GO'})

# LncBook transcript ID -> RNAcentral ID.
lncRNA_gobp2 = pd.merge(lncRNA_gobp2, rnacentral_map_human_lncbook[['LncBook Transcript ID', 'RNAcentral ID']].drop_duplicates().rename(
    columns={'LncBook Transcript ID':'transcript_id'}), on = 'transcript_id').drop(columns=['transcript_id']).rename(
        columns={'RNAcentral ID':'RNA'})

# Normalise PMIDs to digit strings. Only the trailing ".0" left by the float round-trip is
# removed: the previous pattern ".0" is a regex under pandas < 2.0 ("any character followed
# by 0") and silently deleted digits from PMIDs such as 30456789.
lncRNA_gobp2['pmid'] = pd.to_numeric(lncRNA_gobp2['pmid'], errors='coerce')
lncRNA_gobp2['pmid'] = lncRNA_gobp2['pmid'].astype(str)
lncRNA_gobp2['pmid'] = lncRNA_gobp2['pmid'].str.replace(r"\.0$", "", regex=True)
lncRNA_gobp2['pmid'] = lncRNA_gobp2['pmid'].replace("nan", np.nan)

lncRNA_gobp2['Source'] = 'LncRNAWiki'
lncRNA_gobp2 = lncRNA_gobp2.rename(columns={'GO':':END_ID','RNA':':START_ID','pmid':'PubMedID'})
lncRNA_gobp2.head(n=3)

In [None]:
# Stack every RNA "participates in" source and collapse duplicate edges, keeping the
# distinct values of each provenance column as a set per (start, end) pair.
RNA_participates_in_OBO = (
    pd.concat([lncRNA_gobp2, lncRNA_pw_participates, miRNA_GO, RNA_pDeath_participates,
               riboswitch_gobp, ribozyme_go_rnacentral, rfam_gobp])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'PubMedID': set, 'Interactor': set, 'Location': set})
    .reset_index()
    .assign(**{':TYPE': "participates_in"})
)
RNA_participates_in_OBO.to_pickle(unprocessed_edge_data_location+'RNA_participates_in_OBO.pkl')

# Inverse edges: swap the endpoints and relabel the relation.
OBO_has_participant_RNA = RNA_participates_in_OBO.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'}
).assign(**{':TYPE': "has_participant"})
OBO_has_participant_RNA.to_pickle(unprocessed_edge_data_location+'OBO_has_participant_RNA.pkl')
OBO_has_participant_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002528 (is upstream of sequence of) - OBO

* [PuTmiR 1.1](https://www.isical.ac.in/~bioinfo_miu/TF-miRNA1.php) <br/>
PuTmiR is a web server designed for extracting the putative TFs for human miRNAs, as per the requirement of a user, based on genomic locality, i.e., any upstream or downstream region of interest less than 10 kb.

In [None]:
# Download the PuTmiR upstream-region TF-miRNA table into the unprocessed-data folder.
# NOTE: --no-check-certificate disables TLS certificate verification for this request.
!wget https://www.isical.ac.in/~bioinfo_miu/UpstreamRegionTF-miRNA1.txt --no-check-certificate -O ../resources/processed_data/unprocessed_data/UpstreamRegionTF-miRNA1.txt

In [None]:
# PuTmiR upstream-region TF-miRNA table.
miRNA_TF_up = pd.read_csv(unprocessed_data_location+'UpstreamRegionTF-miRNA1.txt', sep='\t') # TF (PRO)
# Drop rows whose miRNA name or TF is the "-" placeholder; .copy() so the column
# assignments below act on an independent frame (avoids SettingWithCopyWarning).
miRNA_TF_up = miRNA_TF_up[(miRNA_TF_up['name'] != "-") & (miRNA_TF_up['TF'] != "-")].copy()
miRNA_TF_up['Start10kb'] = miRNA_TF_up['Start10kb'].astype(int)
miRNA_TF_up['End10kb'] = miRNA_TF_up['End10kb'].astype(int)
miRNA_TF_up['Distance'] = abs(miRNA_TF_up['Start10kb'] - miRNA_TF_up['End10kb'])
miRNA_TF_up = miRNA_TF_up.drop(columns=['chrom','chromStart','chromEnd','strand','Start10kb','End10kb',
                                        'chromStartTF','chromEndTF','Refseq','score'])
# TF symbol -> protein ID (inner merge: unmapped symbols are dropped).
miRNA_TF_up = miRNA_TF_up.merge(symbol_to_pro.rename(columns={0:'TF'}), on='TF').rename(columns={1:'Protein'})

print(all(miRNA_TF_up['name'].isin(rnacentral_map_human['DB Description'])))
print(miRNA_TF_up[~miRNA_TF_up['name'].isin(rnacentral_map_human['DB Description'])]['name'].str[:3].unique())

# Names without a direct RNAcentral match are retried with an explicit arm suffix.
# Each variable now holds the suffix its name announces (they were swapped before);
# the concatenated result is unchanged: '-5p' rows first, then '-3p' rows.
miRNA_RNA_miRNAnotInRNAcentral = miRNA_TF_up[~miRNA_TF_up['name'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = miRNA_RNA_miRNAnotInRNAcentral.assign(
    name=miRNA_RNA_miRNAnotInRNAcentral['name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = miRNA_RNA_miRNAnotInRNAcentral.assign(
    name=miRNA_RNA_miRNAnotInRNAcentral['name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'name'}), on='name').drop(columns=['name']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_TF_up = pd.merge(miRNA_TF_up, rnacentral_map_human.rename(columns={'DB Description':'name'}), on='name').drop(
    columns=['name']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_TF_up = pd.concat([miRNA_TF_up, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])

miRNA_TF_up['zScore'] = miRNA_TF_up['zScore'].astype(float)
miRNA_TF_up['Source'] = 'PuTmiR'
miRNA_TF_up = miRNA_TF_up.rename(columns={'Protein':':END_ID','RNA':':START_ID'})
miRNA_TF_up.head(n=3)

In [None]:
# Collapse duplicate (miRNA, TF) pairs: sources become a set, zScore and Distance are averaged.
# The string 'mean' replaces np.mean, which pandas deprecates as a groupby.agg argument;
# the aggregation performed is the same.
miRNA_TF_up = miRNA_TF_up.groupby([':START_ID',':END_ID']).agg({'Source':set,'zScore':'mean','Distance':'mean'}).reset_index()
miRNA_TF_up[":TYPE"] = "is_upstream_of_sequence_of"
miRNA_TF_up.to_pickle(unprocessed_edge_data_location+'RNA_is_upstream_of_sequence_of_OBO.pkl')

# Inverse edges: swap the endpoints and relabel the relation.
TF_miRNA_down = miRNA_TF_up.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
TF_miRNA_down[":TYPE"] = "is_downstream_of_sequence_of"
TF_miRNA_down.to_pickle(unprocessed_edge_data_location+'OBO_is_downstream_of_sequence_of_RNA.pkl')
TF_miRNA_down.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002529 (is downstream of sequence of) - OBO

* [PuTmiR 1.1](https://www.isical.ac.in/~bioinfo_miu/TF-miRNA1.php) <br/>
PuTmiR is a web server designed for extracting the putative TFs for human miRNAs, as per the requirement of a user, based on genomic locality, i.e., any upstream or downstream region of interest less than 10 kb.

In [None]:
!wget https://www.isical.ac.in/~bioinfo_miu/DownstreamRegionTF-miRNA1.txt --no-check-certificate -O ../resources/processed_data/unprocessed_dataDownstreamRegionTF-miRNA1.txt

In [None]:
# PuTmiR downstream-region TF-miRNA table.
miRNA_TF_down = pd.read_csv(unprocessed_data_location+'DownstreamRegionTF-miRNA1.txt', sep='\t') # TF (PRO)
# Drop rows whose miRNA name or TF is the "-" placeholder; .copy() so the column
# assignments below act on an independent frame (avoids SettingWithCopyWarning).
miRNA_TF_down = miRNA_TF_down[(miRNA_TF_down['name'] != "-") & (miRNA_TF_down['TF'] != "-")].copy()
miRNA_TF_down['Start10kb'] = miRNA_TF_down['Start10kb'].astype(int)
miRNA_TF_down['End10kb'] = miRNA_TF_down['End10kb'].astype(int)
miRNA_TF_down['Distance'] = abs(miRNA_TF_down['Start10kb'] - miRNA_TF_down['End10kb'])
miRNA_TF_down = miRNA_TF_down.drop(columns=['chrom','chromStart','chromEnd','strand','Start10kb','End10kb',
                                            'chromStartTF','chromEndTF','Refseq','score'])
# TF symbol -> protein ID (inner merge: unmapped symbols are dropped).
miRNA_TF_down = miRNA_TF_down.merge(symbol_to_pro.rename(columns={0:'TF'}), on='TF').rename(columns={1:'Protein'})

print(all(miRNA_TF_down['name'].isin(rnacentral_map_human['DB Description'])))
print(miRNA_TF_down[~miRNA_TF_down['name'].isin(rnacentral_map_human['DB Description'])]['name'].str[:3].unique())

# Names without a direct RNAcentral match are retried with an explicit arm suffix.
# Each variable now holds the suffix its name announces (they were swapped before);
# the concatenated result is unchanged: '-5p' rows first, then '-3p' rows.
miRNA_RNA_miRNAnotInRNAcentral = miRNA_TF_down[~miRNA_TF_down['name'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = miRNA_RNA_miRNAnotInRNAcentral.assign(
    name=miRNA_RNA_miRNAnotInRNAcentral['name'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = miRNA_RNA_miRNAnotInRNAcentral.assign(
    name=miRNA_RNA_miRNAnotInRNAcentral['name'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'name'}), on='name').drop(columns=['name']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_TF_down = pd.merge(miRNA_TF_down, rnacentral_map_human.rename(columns={'DB Description':'name'}), on='name').drop(
    columns=['name']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_TF_down = pd.concat([miRNA_TF_down, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])

miRNA_TF_down['zScore'] = miRNA_TF_down['zScore'].astype(float)
miRNA_TF_down['Source'] = 'PuTmiR'
miRNA_TF_down = miRNA_TF_down.rename(columns={'Protein':':END_ID','RNA':':START_ID'})
miRNA_TF_down.head(n=3)

* [TBDB](https://tbdb.io/) <br /> 

TBDB contains T-box riboswitch fold prediction, tRNA pairs from host organisms, information regarding T-box riboswitch genetic context, and thermodynamic calculations of putative T-box riboswitch sequences found in nature.

In [None]:
# Download the TBDB table (tbdb.csv) into the unprocessed-data folder.
!wget https://tbdb.io/database/tbdb.csv -O ../resources/processed_data/unprocessed_data/tbdb.csv

In [None]:
# TBDB riboswitch records: keep those with a downstream protein, lower-case its
# description for the description-based mapping done later, and tag the source.
riboswitch_protein = pd.read_csv(unprocessed_data_location+'tbdb.csv') # Riboswitch (NCBI nuccore?genomic_location) - PRO
riboswitch_protein = (
    riboswitch_protein.loc[riboswitch_protein['downstream_protein'].notna()]
    .assign(
        downstream_protein=lambda frame: frame['downstream_protein'].str.lower(),
        Source='TBDB',
    )
)
riboswitch_protein.head(n=3)

In [None]:
# IDs (column 1) of every protein that has at least one description containing "mitochondrial".
mithocondrial_proteins = desc_pro_map_all.loc[
    desc_pro_map_all[0].astype(str).str.contains("mitochondrial"), 1
].unique()

# Map downstream-protein descriptions to protein IDs, ignoring those mitochondrial
# proteins, then keep only the edge endpoints' final column names.
riboswitch_protein = (
    riboswitch_protein.merge(
        desc_pro_map_all.loc[~desc_pro_map_all[1].isin(mithocondrial_proteins)],
        left_on=['downstream_protein'],
        right_on=[0],
    )
    .drop(columns=[0])
    .drop(columns=['downstream_protein'])
    .rename(columns={1: ':END_ID', 'accession_url': ':START_ID'})
)
riboswitch_protein.head(n=3)

In [None]:
# Combine the TBDB and PuTmiR downstream records and collapse duplicate edges: sources
# become a set, zScore and Distance are averaged (NaN where no source supplies a value).
# The string 'mean' replaces np.mean, which pandas deprecates as a groupby.agg argument;
# the aggregation performed is the same.
RNA_is_downstream_of_sequence_of_OBO = pd.concat([riboswitch_protein, miRNA_TF_down])
RNA_is_downstream_of_sequence_of_OBO = RNA_is_downstream_of_sequence_of_OBO.groupby([':START_ID',':END_ID']).agg(
    {'Source':set,'zScore':'mean','Distance':'mean'}).reset_index()
RNA_is_downstream_of_sequence_of_OBO[":TYPE"] = "is_downstream_of_sequence_of"
RNA_is_downstream_of_sequence_of_OBO.to_pickle(unprocessed_edge_data_location+'RNA_is_downstream_of_sequence_of_OBO.pkl')

# Inverse edges: swap the endpoints and relabel the relation.
OBO_is_upstream_of_sequence_of_RNA = RNA_is_downstream_of_sequence_of_OBO.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
OBO_is_upstream_of_sequence_of_RNA[":TYPE"] = "is_upstream_of_sequence_of"
OBO_is_upstream_of_sequence_of_RNA.to_pickle(unprocessed_edge_data_location+'OBO_is_upstream_of_sequence_of_RNA.pkl')
OBO_is_upstream_of_sequence_of_RNA.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011008 (increases by expression quantity of) - RNA

* [TransmiR](https://www.cuilab.cn/transmir) <br /> TransmiR is a database for transcription factor (TF)-microRNA (miRNA) regulations, through which one can find regulatory relations between TFs and miRNAs.

In [None]:
# Download the TransmiR H. sapiens table (gzip-compressed TSV) into the unprocessed-data folder.
# NOTE: the URL is plain http and --no-check-certificate disables certificate verification.
!wget http://www.cuilab.cn/files/transmir3/download/H.sapiens.tsv.gz --no-check-certificate -O ../resources/processed_data/unprocessed_data/H.sapiens.tsv.gz

In [None]:
# TransmiR TF -> miRNA regulations; TF symbols are mapped to protein IDs and miRNA names
# to RNAcentral IDs (both through inner merges, so unmapped entries are dropped).
miRNA_TF2 = pd.read_csv(unprocessed_data_location+"H.sapiens.tsv.gz", sep="\t").drop(columns=['tss','spe','gid']) # TF (PRO)
miRNA_TF2 = miRNA_TF2.merge(symbol_to_pro.rename(columns={0:'tf'}), on='tf').rename(columns={1:'Protein'}).drop(columns=['tf'])

print(all(miRNA_TF2['mir'].isin(rnacentral_map_human['DB Description'])))
print(miRNA_TF2[~miRNA_TF2['mir'].isin(rnacentral_map_human['DB Description'])]['mir'].str[:3].unique())
miRNA_TF2 = miRNA_TF2[miRNA_TF2['mir'].str.startswith('hsa')]

# Names without a direct RNAcentral match are retried with an explicit arm suffix.
# Each variable now holds the suffix its name announces (they were swapped before);
# the concatenated result is unchanged: '-5p' rows first, then '-3p' rows.
miRNA_RNA_miRNAnotInRNAcentral = miRNA_TF2[~miRNA_TF2['mir'].isin(rnacentral_map_human['DB Description'])]
miRNA_RNA_miRNAnotInRNAcentral5p = miRNA_RNA_miRNAnotInRNAcentral.assign(
    mir=miRNA_RNA_miRNAnotInRNAcentral['mir'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = miRNA_RNA_miRNAnotInRNAcentral.assign(
    mir=miRNA_RNA_miRNAnotInRNAcentral['mir'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'mir'}), on='mir').drop(columns=['mir']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_TF2 = pd.merge(miRNA_TF2, rnacentral_map_human.rename(columns={'DB Description':'mir'}), on='mir').drop(
    columns=['mir']).rename(columns={'RNAcentral ID':'RNA'})
miRNA_TF2 = pd.concat([miRNA_TF2, miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB','DB ID','Organism','RNA category'])

# Collapse the free-text regulation labels into Activation / Repression / Regulation.
# Values are lower-cased first, so every key must be lower case too: the last key used to
# contain capitals ("Feedback Loop") and therefore never matched. Labels that are not
# listed here become NaN.
miRNA_TF2['active'] = miRNA_TF2['active'].str.lower()
print(miRNA_TF2['active'].unique())
miRNA_TF2['active'] = miRNA_TF2['active'].map({
    'repression': 'Repression',
    'activation(feedback)': 'Activation',
    'activation': 'Activation',
    'regulation': 'Regulation',
    'repression(feedback)': 'Repression',
    'regulation(feedback)': 'Regulation',
    'autoregulatory negative feedback loop(feedback)': 'Regulation',
    'auto-regulatory feedback circuit': 'Regulation',
    'regulatory network': 'Regulation',
    'activation(negative regulatory loop)': 'Activation',
    'regulatory loop': 'Regulation',
    'activation(a negative feedback loop)': 'Activation',
    'regulation(double-negative feedback loop)': 'Regulation'
})

# Split by evidence type: literature rows keep their identifier as PMID and are tagged
# with both sources, the remaining rows are tagged with TransmiR only.
miRNA_TF2_lit = miRNA_TF2[miRNA_TF2['evi'] == 'literature']
miRNA_TF2 = miRNA_TF2[miRNA_TF2['evi'] != 'literature']
miRNA_TF2 = miRNA_TF2.drop(columns=['evi'])
miRNA_TF2_lit = miRNA_TF2_lit.rename(columns={'idnum':'PMID'})
miRNA_TF2_lit = miRNA_TF2_lit.drop(columns=['site','evi','tis','emethod'])
miRNA_TF2_lit['Source'] = 'miRNet, TransmiR'
miRNA_TF2['Source'] = 'TransmiR'
miRNA_TF2 = pd.merge(miRNA_TF2, miRNA_TF2_lit, on=['active','Protein','RNA'], how='outer')
miRNA_TF2['Source'] = miRNA_TF2['Source_x'].astype(str) + ', ' + miRNA_TF2['Source_y'].astype(str)

# One row per individual source; the 'nan' entries come from the side of the outer merge
# that had no match.
miRNA_TF2 = miRNA_TF2.drop(columns=['Source_x','Source_y'])
miRNA_TF2['Source'] = miRNA_TF2['Source'].str.split(', ')
miRNA_TF2 = miRNA_TF2.explode('Source')
miRNA_TF2 = miRNA_TF2[miRNA_TF2['Source']!='nan']
miRNA_TF2 = miRNA_TF2.rename(columns={'active':'action_type','site':'Site','tis':'Tissue/cell_line','emethod':'Experimental_method'})

# Harmonise experimental methods through method_map; unmapped values keep the raw text.
miRNA_TF2['Experimental_method'] = miRNA_TF2['Experimental_method'].str.lower()
miRNA_TF2 = pd.merge(miRNA_TF2, method_map, right_on='0_y', left_on='Experimental_method', how='left')
miRNA_TF2['0_x'] = miRNA_TF2['0_x'].fillna(miRNA_TF2['Experimental_method'])
miRNA_TF2 = miRNA_TF2.drop(columns=['0_y', 'Experimental_method'])
miRNA_TF2 = miRNA_TF2.rename(columns={'0_x':'Method'})

# Harmonise tissues / cell lines through location_map; unmapped values keep the raw text.
miRNA_TF2['Tissue/cell_line'] = miRNA_TF2['Tissue/cell_line'].str.lower()
miRNA_TF2 = pd.merge(miRNA_TF2, location_map, right_on='0_y', left_on='Tissue/cell_line', how='left')
miRNA_TF2['0_x'] = miRNA_TF2['0_x'].fillna(miRNA_TF2['Tissue/cell_line'])
miRNA_TF2 = miRNA_TF2.drop(columns=['0_y', 'Tissue/cell_line'])
miRNA_TF2 = miRNA_TF2.rename(columns={'0_x':'Location'})

miRNA_TF2 = miRNA_TF2.drop(columns=['Site'])
miRNA_TF2 = miRNA_TF2.rename(columns={'Protein':':START_ID','RNA':':END_ID'})

miRNA_TF2.head(n=3)

In [None]:
# TransmiR records whose harmonised action type is 'Activation'.
miRNA_TF2_up = miRNA_TF2.loc[miRNA_TF2['action_type'].eq('Activation')].drop(
    columns='action_type')
miRNA_TF2_up.head(n=3)

In [None]:
# One edge per (TF, miRNA) pair, with the distinct sources, locations and methods as sets.
miRNA_TF2_up = (
    miRNA_TF2_up
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'Method': set})
    .reset_index()
    .assign(**{':TYPE': "increases_by_expression_quantity_of"})
)
miRNA_TF2_up.to_pickle(unprocessed_edge_data_location+'OBO_increases_by_expression_quantity_of_RNA.pkl')
miRNA_TF2_up.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011007 (decreases by repression quantity of) - RNA


In [None]:
# TransmiR records whose harmonised action type is 'Repression'.
miRNA_TF2_down = miRNA_TF2.loc[miRNA_TF2['action_type'].eq('Repression')].drop(
    columns='action_type') # TF (PRO)
miRNA_TF2_down.head(n=3)

In [None]:
# One edge per (TF, miRNA) pair, with the distinct sources, locations and methods as sets.
# NOTE(review): the label and file name say "decreases_by_expression_quantity_of" while the
# section heading reads "decreases by repression quantity of" - confirm which is intended.
miRNA_TF2_down = (
    miRNA_TF2_down
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'Method': set})
    .reset_index()
    .assign(**{':TYPE': "decreases_by_expression_quantity_of"})
)
miRNA_TF2_down.to_pickle(unprocessed_edge_data_location+'OBO_decreases_by_expression_quantity_of_RNA.pkl')
miRNA_TF2_down.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011003 (regulates quantity of) - RNA

In [None]:
# TransmiR records whose harmonised action type is the generic 'Regulation'.
miRNA_TF2_reg = miRNA_TF2.loc[miRNA_TF2['action_type'].eq('Regulation')].drop(
    columns='action_type') # TF (PRO)
miRNA_TF2_reg.head(n=3)

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# TAM miRNA sets: one tab-separated line per set, padded with '' to a rectangular frame.
with open(unprocessed_data_location+'mirset_v9.txt', 'r') as file:
    data = file.read().rstrip()

TAM = pd.DataFrame(
    [row.rstrip().split('\t') for row in io.StringIO(data).readlines()]
).fillna('')
TAM = TAM.dropna(axis=1, how='all')

# Sets whose category (column 0) contains "TF"; column 1 is the TF symbol and the
# remaining columns are the member miRNAs. Note that miRNA_TF2 is re-bound here.
miRNA_TF2 = TAM.loc[TAM[0].str.contains("TF")].dropna(axis=1, how='all').drop(columns=[0])
miRNA_TF2['merged'] = miRNA_TF2[miRNA_TF2.columns[1:]].apply(
    lambda members: ','.join(members.dropna().astype(str)),
    axis=1
)

# One row per (TF, miRNA); the empty strings produced by the padding are discarded.
miRNA_TF2 = (
    miRNA_TF2[[1, 'merged']]
    .assign(merged=lambda frame: frame['merged'].str.split(','))
    .explode('merged')
)
miRNA_TF2 = miRNA_TF2.loc[miRNA_TF2['merged'].ne('')]

print(all(miRNA_TF2['merged'].isin(rnacentral_map_human['DB Description'])))
# miRNA name -> RNAcentral ID, then TF symbol -> protein ID (both inner merges).
miRNA_TF2 = miRNA_TF2.merge(
    rnacentral_map_human.rename(columns={'DB Description': 'merged'}), on='merged'
).drop(columns=['merged', 'DB', 'DB ID', 'Organism', 'RNA category']).rename(
    columns={'RNAcentral ID': 'RNA'})
miRNA_TF2 = miRNA_TF2.merge(
    symbol_to_pro.rename(columns={0: 1, 1: 'Protein'}), on=[1]
).drop(columns=[1])

miRNA_TF2 = miRNA_TF2.assign(Source='TAM').rename(
    columns={'RNA': ':END_ID', 'Protein': ':START_ID'})
miRNA_TF2.head(n=3)

In [None]:
# Merge the TAM TF sets with the generic TransmiR regulations and collapse duplicate
# edges; sources, locations and methods are kept as sets per (TF, miRNA) pair.
OBO_regulates_quantity_of_RNA = (
    pd.concat([miRNA_TF2, miRNA_TF2_reg])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'Method': set})
    .reset_index()
    .assign(**{':TYPE': "regulates_quantity_of"})
)
OBO_regulates_quantity_of_RNA.to_pickle(unprocessed_edge_data_location+'OBO_regulates_quantity_of_RNA.pkl')
OBO_regulates_quantity_of_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0000085 (has function) - OBO

* [Rfam](http://rfamlive.xfam.org/) <br /> The Rfam database is a collection of RNA families, each represented by multiple sequence alignments, consensus secondary structures and covariance models.

In [None]:
# Preview rfam_gomf (defined outside this section); it is concatenated into
# RNA_has_function_OBO further down.
rfam_gomf.head(n=3)

* [TAM](http://www.lirmed.com/tam2/) <br /> TAM groups miRNAs into six categories of miRNA sets: miRNA-family sets, miRNA cluster sets, miRNA-disease, miRNA-function sets, miRNA-TF sets and tissue specificity sets.

In [None]:
# TAM miRNA sets: one tab-separated line per set, padded with '' to a rectangular frame.
with open(unprocessed_data_location+'mirset_v9.txt', 'r') as file:
    data = file.read().rstrip()

TAM = pd.DataFrame(
    [row.rstrip().split('\t') for row in io.StringIO(data).readlines()]
).fillna('')
TAM = TAM.dropna(axis=1, how='all')

# Sets whose category (column 0) contains "unction"; column 1 is the function name
# (lower-cased for the description-based GO mapping), the rest are member miRNAs.
miRNA_GO = TAM.loc[TAM[0].str.contains("unction")].copy() # GO
miRNA_GO[1] = miRNA_GO[1].str.lower()
miRNA_GO = miRNA_GO.dropna(axis=1, how='all').drop(columns=[0])

miRNA_GO['merged'] = miRNA_GO[miRNA_GO.columns[1:]].apply(
    lambda members: ','.join(members.dropna().astype(str)),
    axis=1
)

# One row per (function, miRNA); the empty strings produced by the padding are discarded.
miRNA_GO = (
    miRNA_GO[[1, 'merged']]
    .assign(merged=lambda frame: frame['merged'].str.split(','))
    .explode('merged')
)
miRNA_GO = miRNA_GO.loc[miRNA_GO['merged'].ne('')]

print(all(miRNA_GO['merged'].isin(rnacentral_map_human['DB Description'])))
# These are all dead hairpin entries
print(miRNA_GO[~miRNA_GO['merged'].isin(rnacentral_map_human['DB Description'])])
# miRNA name -> RNAcentral ID, then function description -> GO ID (both inner merges).
miRNA_GO = miRNA_GO.merge(
    rnacentral_map_human.rename(columns={'DB Description': 'merged'}), on='merged'
).drop(columns=['merged', 'DB', 'DB ID', 'Organism', 'RNA category']).rename(
    columns={'RNAcentral ID': 'RNA'})
miRNA_GO = miRNA_GO.merge(
    desc_go_map.rename(columns={0: 1, 1: 'GO'}), on=[1]
).drop(columns=[1])

miRNA_GO = miRNA_GO.assign(Source='TAM').rename(
    columns={'RNA': ':START_ID', 'GO': ':END_ID'})
miRNA_GO.head(n=3)

* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html) <br /> miRPathDB includes miRNA candidates, experimentally validated target genes, extended analysis functionality, and intuitive visualizations of query results. 

In [None]:
!wget https://mpd.bioinf.uni-sb.de/download/version_2/miRPathDB2_hsa_gmt.tar.gz -O ../resources/processed_data/unprocessed_data/miRPathDB2_hsa_gmt.tar.gz

tar = tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz')
tar.extractall(unprocessed_data_location)
tar.close()

In [None]:
# miRPathDB GO molecular-function gene sets (validated miRTarBase "strong" evidence).
with open(unprocessed_data_location+'hsa/GO_MF_validated_miRTarBase_strong.gmt', 'r') as file:
    data = file.read().rstrip()

# One tab-separated line per term: column 0 is the term description, column 1 is dropped,
# the remaining columns hold the member miRNAs (short lines are padded with '').
miRNA_GO2 = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

miRNA_GO2[0] = miRNA_GO2[0].str.lower()
miRNA_GO2 = miRNA_GO2.dropna(axis=1, how='all')
miRNA_GO2 = miRNA_GO2.drop(columns=[1])
# After this merge column 0 is the description and column 1 the ID taken from desc_go_map.
miRNA_GO2 = pd.merge(desc_go_map, miRNA_GO2, left_on=[0], right_on=[0])
miRNA_GO2 = miRNA_GO2.dropna(axis=1, how='all')

# Wide -> long in one vectorised step (replaces a per-row transpose + concat loop):
# column 0 = member miRNA, 1 = term description, 2 = term ID, in row-major order.
gmt_members = miRNA_GO2.drop(columns=[0, 1])
miRNA_GO2 = pd.DataFrame({
    0: gmt_members.to_numpy().ravel(),
    1: np.repeat(miRNA_GO2[0].to_numpy(), gmt_members.shape[1]),
    2: np.repeat(miRNA_GO2[1].to_numpy(), gmt_members.shape[1]),
})
miRNA_GO2 = miRNA_GO2.dropna()
miRNA_GO2 = miRNA_GO2[miRNA_GO2[0] != '']   # discard the padding cells

# True when every miRNA name has an RNAcentral description; unmatched names are dropped
# by the inner merge below.
print(all(miRNA_GO2[0].isin(rnacentral_map_human['DB Description'])))
miRNA_GO2 = pd.merge(miRNA_GO2, rnacentral_map_human.rename(columns={'DB Description':0}), on=0).drop(
    columns=[0,1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})

miRNA_GO2['Source'] = 'miRPathDB'
miRNA_GO2 = miRNA_GO2.rename(columns={'RNA':':START_ID',2:':END_ID'})
miRNA_GO2.head(n=3)

In [None]:
# Stack the miRPathDB, TAM and Rfam function annotations and keep one edge per
# (RNA, GO term) pair with the set of supporting sources.
RNA_has_function_OBO = (
    pd.concat([miRNA_GO2, miRNA_GO, rfam_gomf])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': "has_function"})
)
RNA_has_function_OBO.to_pickle(unprocessed_edge_data_location+'RNA_has_function_OBO.pkl')

# Inverse edges: swap the endpoints and relabel the relation.
OBO_function_of_RNA = RNA_has_function_OBO.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'}
).assign(**{':TYPE': "function_of"})
OBO_function_of_RNA.to_pickle(unprocessed_edge_data_location+'OBO_function_of_RNA.pkl')
OBO_function_of_RNA.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0000056 (participates in) - Reactome

* Reactome

In [None]:
# Fetch Reactome's ChEBI -> pathway mapping (all levels) into the unprocessed-data folder.
# data_downloader is not defined in this section - presumably a pkt_kg.utils helper; confirm.
data_downloader("https://reactome.org/download/current/ChEBI2Reactome_All_Levels.txt", unprocessed_data_location)

In [None]:
# Reactome ChEBI -> pathway table (no header row): column 0 is the ChEBI number, 2 the
# pathway URL, 4 is exported as 'GO_evidence', 5 the species.
chemical_reactome = pd.read_csv(unprocessed_data_location+'ChEBI2Reactome_All_Levels.txt', sep='\t', header=None)
# .copy() so the column assignments below act on an independent frame rather than on a
# filtered slice (avoids SettingWithCopyWarning).
chemical_reactome = chemical_reactome[chemical_reactome[5] == 'Homo sapiens'].copy()
chemical_reactome[0] = 'CHEBI_' + chemical_reactome[0].astype(str)
# regex=False: the URL prefix is a literal, not a pattern ('.' would otherwise act as a
# wildcard under pandas versions whose str.replace defaults to regex=True).
chemical_reactome[2] = chemical_reactome[2].str.replace("https://reactome.org/PathwayBrowser/#/", "", regex=False)
chemical_reactome = chemical_reactome[[0,2,4]].rename(columns={0:':START_ID', 2:':END_ID', 4:'GO_evidence'})
chemical_reactome['Source'] = 'Reactome'
chemical_reactome.head(n=3)

In [None]:
# Fetch Reactome's UniProt -> pathway mapping (all levels) into the unprocessed-data folder.
# data_downloader is not defined in this section - presumably a pkt_kg.utils helper; confirm.
data_downloader("https://reactome.org/download/current/UniProt2Reactome_All_Levels.txt", unprocessed_data_location)

In [None]:
# Reactome UniProt -> pathway table (no header row): column 0 is the UniProt accession,
# 2 the pathway URL, 4 is exported as 'GO_evidence', 5 the species.
pro_reactome = pd.read_csv(unprocessed_data_location+'UniProt2Reactome_All_Levels.txt', sep='\t', header=None)
pro_reactome = pro_reactome[pro_reactome[5] == 'Homo sapiens']
# Join on column 0; both frames have a column 1, so the one coming from unipro_pro_map is
# suffixed '1_y' and used as the edge start below.
pro_reactome = pd.merge(pro_reactome, unipro_pro_map, on=0)
# regex=False: the URL prefix is a literal, not a pattern ('.' would otherwise act as a
# wildcard under pandas versions whose str.replace defaults to regex=True).
pro_reactome[2] = pro_reactome[2].str.replace("https://reactome.org/PathwayBrowser/#/", "", regex=False)
pro_reactome = pro_reactome[['1_y',2,4]].rename(columns={'1_y':':START_ID', 2:':END_ID', 4:'GO_evidence'})
pro_reactome['Source'] = 'Reactome'
pro_reactome.head(n=3)

In [None]:
# Chemical and protein participants of Reactome pathways, one edge per
# (entity, pathway) pair with the sets of sources and evidence codes.
OBO_participates_in_reactome = (
    pd.concat([chemical_reactome, pro_reactome])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': "participates_in"})
)
OBO_participates_in_reactome.to_pickle(unprocessed_edge_data_location+'OBO_participates_in_reactome.pkl')

# Inverse edges: swap the endpoints and relabel the relation.
reactome_has_participant_OBO = OBO_participates_in_reactome.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'}
).assign(**{':TYPE': "has_participant"})
reactome_has_participant_OBO.to_pickle(unprocessed_edge_data_location+'reactome_has_participant_OBO.pkl')
reactome_has_participant_OBO.head(n=3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0000056 (participates in) - Reactome

* CTD

In [None]:
# Fetch the CTD gene-pathway report into the unprocessed-data folder.
# NOTE(review): the next cell reads 'CTD_genes_pathways.tsv' (no .gz), so data_downloader
# presumably decompresses the archive - confirm against the helper.
data_downloader("https://ctdbase.org/reports/CTD_genes_pathways.tsv.gz", unprocessed_data_location)

In [None]:
# CTD gene-pathway report: lines starting with '#' are skipped and the columns are named
# explicitly; only the gene and pathway identifiers are needed.
ctd_pathway = pd.read_csv(
    unprocessed_data_location+'CTD_genes_pathways.tsv',
    sep='\t',
    comment="#",
    names=['GeneSymbol','GeneID','PathwayName','PathwayID'],
)[['GeneID', 'PathwayID']]

# Keep human Reactome pathways and strip the 'REACT:' prefix from their identifiers.
ctd_pathway = (
    ctd_pathway[ctd_pathway['PathwayID'].str.startswith('REACT:R-HSA-')]
    .assign(PathwayID=lambda frame: frame['PathwayID'].str.replace('REACT:',''))
    .rename(columns={'GeneID': ':START_ID', 'PathwayID': ':END_ID'})
    .assign(Source="CTD")
)
ctd_pathway.head(n=3)

In [None]:
# One edge per (gene, pathway) pair with the set of sources.
ctd_pathway = (
    ctd_pathway
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': "participates_in"})
)
ctd_pathway.to_pickle(unprocessed_edge_data_location+'gene_participates_in_reactome.pkl')

# Inverse edges: the same variable is re-bound to the swapped, relabelled frame.
ctd_pathway = ctd_pathway.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'}
).assign(**{':TYPE': "has_participant"})
ctd_pathway.to_pickle(unprocessed_edge_data_location+'reactome_has_participant_gene.pkl')
ctd_pathway.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0000056 (participates in) - Reactome

* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
# miRPathDB KEGG gene sets (validated miRTarBase "strong" evidence); set descriptions are
# matched against desc_reactome_map, so only descriptions present there are kept.
with open(unprocessed_data_location+'hsa/KEGG_validated_miRTarBase_strong.gmt', 'r') as file:
    data = file.read().rstrip()

# One tab-separated line per set: column 0 is the set description, column 1 is dropped,
# the remaining columns hold the member miRNAs (short lines are padded with '').
miRNA_pw = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

miRNA_pw = miRNA_pw.dropna(axis=1, how='all')
miRNA_pw[0] = miRNA_pw[0].str.lower()
miRNA_pw = miRNA_pw.drop(columns=[1])
# After this merge column 0 is the description and column 1 the ID from desc_reactome_map.
miRNA_pw = pd.merge(desc_reactome_map, miRNA_pw, on=[0])
miRNA_pw = miRNA_pw.dropna(axis=1, how='all')

# Wide -> long in one vectorised step (replaces a per-row transpose + concat loop):
# column 0 = member miRNA, 1 = set description, 2 = pathway ID, in row-major order.
gmt_members = miRNA_pw.drop(columns=[0, 1])
miRNA_pw = pd.DataFrame({
    0: gmt_members.to_numpy().ravel(),
    1: np.repeat(miRNA_pw[0].to_numpy(), gmt_members.shape[1]),
    2: np.repeat(miRNA_pw[1].to_numpy(), gmt_members.shape[1]),
})
miRNA_pw = miRNA_pw.dropna()
miRNA_pw = miRNA_pw[miRNA_pw[0] != '']   # discard the padding cells

# miRNA name -> RNAcentral ID (inner merge: unmatched names are dropped).
miRNA_pw = pd.merge(miRNA_pw, rnacentral_map_human.rename(columns={'DB Description':0}), on=0).drop(
    columns=[0,1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':':START_ID', 2:':END_ID'})
miRNA_pw['Source'] = 'miRPathDB'
miRNA_pw.head(n=3)

In [None]:
# miRPathDB Reactome gene sets (validated miRTarBase "strong" evidence).
with open(unprocessed_data_location+'hsa/REACTOME_validated_miRTarBase_strong.gmt', 'r') as file:
    data = file.read().rstrip()

# One tab-separated line per set: column 0 is the set description, column 1 is dropped,
# the remaining columns hold the member miRNAs (short lines are padded with '').
miRNA_pw2 = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

miRNA_pw2 = miRNA_pw2.drop(columns=[1])
miRNA_pw2[0] = miRNA_pw2[0].str.lower()
# After this merge column 0 is the description and column 1 the ID from desc_reactome_map.
miRNA_pw2 = pd.merge(desc_reactome_map, miRNA_pw2, on=[0])
miRNA_pw2 = miRNA_pw2.dropna(axis=1, how='all')

# Wide -> long in one vectorised step (replaces a per-row transpose + concat loop):
# column 0 = member miRNA, 1 = set description, 2 = pathway ID, in row-major order.
gmt_members = miRNA_pw2.drop(columns=[0, 1])
miRNA_pw2 = pd.DataFrame({
    0: gmt_members.to_numpy().ravel(),
    1: np.repeat(miRNA_pw2[0].to_numpy(), gmt_members.shape[1]),
    2: np.repeat(miRNA_pw2[1].to_numpy(), gmt_members.shape[1]),
})
miRNA_pw2 = miRNA_pw2.dropna()
miRNA_pw2 = miRNA_pw2[miRNA_pw2[0] != '']   # discard the padding cells

# True when every miRNA name has an RNAcentral description; unmatched names are dropped
# by the inner merge below.
print(all(miRNA_pw2[0].isin(rnacentral_map_human['DB Description'])))
miRNA_pw2 = pd.merge(miRNA_pw2, rnacentral_map_human.rename(columns={'DB Description':0}), on=0).drop(
    columns=[0,1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})

miRNA_pw2['Source'] = 'miRPathDB'
miRNA_pw2 = miRNA_pw2.rename(columns={'RNA':':START_ID',2:':END_ID'})
miRNA_pw2.head(n=3)

In [None]:
# Pool both miRPathDB tables, collapse duplicate (start, end) pairs while
# collecting their sources into a set, and save the forward edges.
miRNA_pw = (
    pd.concat([miRNA_pw, miRNA_pw2])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "participates_in"})
)
miRNA_pw.to_pickle(unprocessed_edge_data_location + 'RNA_participates_in_reactome.pkl')

# Swap the endpoint columns to produce the inverse "has_participant" edges.
miRNA_pw = miRNA_pw.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'}
).assign(**{":TYPE": "has_participant"})
miRNA_pw.to_pickle(unprocessed_edge_data_location + 'reactome_has_participant_RNA.pkl')
miRNA_pw.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0000056 (participates in) - WikiPathways

* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
with open(unprocessed_data_location+'hsa/WIKIPATHWAYS_validated_miRTarBase_strong.gmt', 'r') as file:
    data = file.read().rstrip()

# One row per tab-separated line; short rows are padded with '' instead of NaN.
miRNA_pw = pd.DataFrame(
    [ln.rstrip().split('\t') for ln in io.StringIO(data).readlines()]
).fillna('')

# Normalise the name in column 0: '-', '/' and ':' become spaces, the text is
# lower-cased, and anything between the first '(' and the last ')' is removed.
miRNA_pw[0] = (
    miRNA_pw[0]
    .str.replace('-', ' ')
    .str.lower()
    .str.replace('/', ' ')
    .str.replace(':', ' ')
    .str.replace(r'\(.*\)', '', regex=True)
)
miRNA_pw.head(n=1)

In [None]:
miRNA_pw = miRNA_pw.dropna(axis=1, how='all')
miRNA_pw = miRNA_pw.drop(columns=[1])
# After the merge, column 0 is the join key and column 1 comes from desc_wpw_map.
miRNA_pw = pd.merge(desc_wpw_map, miRNA_pw, on=[0])
miRNA_pw = miRNA_pw.dropna(axis=1, how='all')

# Wide -> long: every remaining cell of a row becomes its own record
# (0 = member name, 1 = row's column 0, 2 = row's column 1).
# Rows are walked once with iterrows() and concatenated in a single call,
# instead of re-transposing the whole frame and re-concatenating per row.
dflist = list()
for _, row in miRNA_pw.iterrows():
    df = pd.DataFrame(columns=[0, 1, 2])
    df[0] = row.drop(index=[0, 1])
    df[1] = row[0]
    df[2] = row[1]
    dflist.append(df)

# The empty seed frame keeps the [0, 1, 2] layout even when dflist is empty.
miRNA_pw = pd.concat([pd.DataFrame(columns=[0, 1, 2])] + dflist)
miRNA_pw = miRNA_pw.dropna()
miRNA_pw = miRNA_pw[miRNA_pw[0] != '']  # discard the '' padding cells

# Sanity check: True when every member name has an RNAcentral description match.
print(all(miRNA_pw[0].isin(rnacentral_map_human['DB Description'])))
miRNA_pw = pd.merge(miRNA_pw, rnacentral_map_human.rename(columns={'DB Description':0}), on=0).drop(
    columns=[0,1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':'RNA'})

miRNA_pw['Source'] = 'miRPathDB'
miRNA_pw = miRNA_pw.rename(columns={'RNA':':START_ID',2:':END_ID'})
miRNA_pw.head(n=3)

In [None]:
# Collapse duplicate (start, end) pairs, collecting their sources into a set,
# and save the forward "participates_in" edges.
miRNA_pw = (
    miRNA_pw
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "participates_in"})
)
miRNA_pw.to_pickle(unprocessed_edge_data_location + 'RNA_participates_in_wikipathways.pkl')

# Swap the endpoint columns to produce the inverse "has_participant" edges.
miRNA_pw = miRNA_pw.rename(
    columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'}
).assign(**{":TYPE": "has_participant"})
miRNA_pw.to_pickle(unprocessed_edge_data_location + 'wikipathways_has_participant_RNA.pkl')
miRNA_pw.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0003002 (represses expression of) - Gene

* [Addgene](https://www.addgene.org/)

In [None]:
# copy-paste table from https://www.addgene.org/crispr/reference/grna-sequence/#datatable
gRNA_gene = pd.read_csv(unprocessed_data_location + 'grna_sequences_addgene.txt', sep='\t', dtype = {"Plasmid ID":str})
gRNA_gene.columns=gRNA_gene.columns.str.rstrip()

# Keep rows whose target species mentions 'apiens' and that carry a plasmid ID.
# .copy() detaches the filtered rows so the column assignments below operate on
# an independent frame (no SettingWithCopyWarning).
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].notna()]
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].str.contains('apiens')]
gRNA_gene = gRNA_gene[~gRNA_gene['Plasmid ID'].isna()].copy()
gRNA_gene['Plasmid ID'] = 'www.addgene.org/'+gRNA_gene['Plasmid ID'].astype(str).str.rstrip()
gRNA_gene['Target Gene'] = gRNA_gene['Target Gene'].str.upper().astype(str).str.rstrip()
gRNA_gene = gRNA_gene.drop(columns=['Target Species','Cas9 Species','Depositor'])

# Replace the gene symbol with the value in column 1 of symbol_entrez_map
# (inner join: rows whose symbol is not in the map are dropped).
gRNA_gene = gRNA_gene.merge(symbol_entrez_map.rename(columns={0:'Target Gene'}), on='Target Gene').drop(
    columns=['Target Gene']).rename(columns={1:'Target Gene'})

# Normalise PubMed IDs to plain digit strings; unparsable values become NaN.
gRNA_gene['PubMed ID'] = gRNA_gene['PubMed ID'].str.replace(' $', '', regex=True)
gRNA_gene['PubMed ID'] = pd.to_numeric(gRNA_gene['PubMed ID'], errors='coerce')
gRNA_gene['PubMed ID'] = gRNA_gene['PubMed ID'].astype(str)
# Strip only a trailing float suffix. An unanchored ".0" is a regex on
# pandas < 2.0 (any character followed by 0) and would eat digits inside the ID.
gRNA_gene['PubMed ID'] = gRNA_gene['PubMed ID'].str.replace(r'\.0$', '', regex=True)
gRNA_gene['PubMed ID'] = gRNA_gene['PubMed ID'].replace("nan", np.nan)

# One row per '/'-separated application, mapped through method_map
# ('0_y' is matched, '0_x' is the mapped value); unmapped terms keep the raw text.
gRNA_gene.Application = gRNA_gene.Application.str.replace(' $', '', regex=True)
gRNA_gene['Application'] = gRNA_gene['Application'].str.lower().str.split("/")
gRNA_gene = gRNA_gene.explode('Application')
gRNA_gene = pd.merge(gRNA_gene, method_map, right_on='0_y', left_on='Application', how='left')
gRNA_gene['0_x'] = gRNA_gene['0_x'].fillna(gRNA_gene['Application'])
gRNA_gene = gRNA_gene.drop(columns=['0_y', 'Application'])
# NOTE(review): the value mapped through method_map is stored as 'Location',
# whereas the eSkip-Finder cell names its method_map result 'Method' — confirm.
gRNA_gene = gRNA_gene.rename(columns={'0_x':'Location'})

gRNA_gene['Source'] = 'Addgene'
gRNA_gene = gRNA_gene.rename(columns={'Plasmid ID':':START_ID','Target Gene':':END_ID','PubMed ID':'PubMedID'})
gRNA_gene.head(n=3)

* [eSkip-Finder](https://eskip-finder.org/cgi-bin/input.cgi) <br /> eSkip-Finder is the first machine learning-based design tool and database of antisense oligonucleotides (ASOs) for exon skipping. It addresses a significant challenge in the field: the difficulty of selecting an optimal target sequence for exon skipping.

In [None]:
# We define a function to fetch the PubMed ID from the reference provided by eSkip-Finder
Entrez.email = "emanuele.cavalleri@unimi.it"

def fetch_pubmed_id(reference):
    """Look up a literature reference on PubMed and return the top hit's ID.

    Parameters
    ----------
    reference : str
        Free-text reference, passed unchanged as the PubMed search term.

    Returns
    -------
    str
        The first entry of the search's ``IdList``; the string ``"Null"`` when
        the search returns no IDs; or ``"Error: <message>"`` when the request
        or the parsing raises. No exception propagates to the caller.
    """
    try:
        handle = Entrez.esearch(db="pubmed", term=reference, retmax=1)
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the response fails.
            handle.close()
        id_list = record["IdList"]
        # Return the first PubMed ID if available
        return id_list[0] if id_list else "Null"
    except Exception as e:
        # Best-effort lookup: report the failure in-band so a batch of lookups keeps going.
        return f"Error: {e}"

In [None]:
# https://eskip-finder.org/ --> Search the Database --> Search 'All' on Species=human
# The saved results page is parsed with read_html and its third table ([2]) is used.
# NOTE(review): the file name '.htmeSkip-Finderl' looks like a garbled
# 'eSkip-Finder.html' — confirm the actual name of the saved page.
ASO_mRNA = pd.read_html(unprocessed_data_location + '.htmeSkip-Finderl')[2]
# Keep human entries that have an oligo name and a confidence level of '1'
# (compared as a string).
ASO_mRNA = ASO_mRNA[ASO_mRNA['Species'] == 'human']
ASO_mRNA = ASO_mRNA[ASO_mRNA['Oligo name in literature'] != 'Null']
ASO_mRNA = ASO_mRNA[ASO_mRNA['confidence level (1:describe to explicitly / 0:speculated from context)']=='1']
# Discard the columns that are not carried into the edge table.
ASO_mRNA.drop(columns=['Oligo index in literature',"Oligo concentration","Unit for oligo concentration",
                       'Oligo sequence /: Cocktail. -: weasel (connected).',"Electroporation (Yes/No)",
                       'cap of 5 or 3 terminal (Conjugated end is not specified.)',"Use of transfection reagent (Yes/No)",
                       'cap of 5 terminal','cap of 3 terminal',"nested pcr?","Standard type","Standard relation",
                       'Species','Literature info (Patent ID) (original)',"Standard value",
                       'Oligo chemistry','Title','Date','Inventor','Assignee/Applicants',"Alternative/translated literature",
                       "confidence level (1:describe to explicitly / 0:speculated from context)",
                       'Figure/Table in literature','Appendix','Unnamed: 31'],inplace=True)
# Replace the target gene name with the value in column 1 of symbol_entrez_map
# (inner join: targets missing from the map are dropped).
ASO_mRNA = pd.merge(ASO_mRNA,symbol_entrez_map.rename(columns={0:'Target gene (RNA)'}), on='Target gene (RNA)').drop(
    columns=['Target gene (RNA)']).rename(columns={1:'Gene'})
# Oligo names: trim, then remove every internal space.
ASO_mRNA['Oligo name in literature'] = ASO_mRNA['Oligo name in literature'].str.strip()
ASO_mRNA['Oligo name in literature'] = ASO_mRNA['Oligo name in literature'].str.replace(' ', '')
# 'Null' and 'unspecified' oligo counts are both treated as 1.
ASO_mRNA['# of oligo'] = ASO_mRNA['# of oligo'].str.replace('Null',"1")
ASO_mRNA['# of oligo'] = ASO_mRNA['# of oligo'].str.replace('unspecified',"1").astype(int)
ASO_mRNA["Reference"] = ASO_mRNA["Reference"].str.strip()
print(ASO_mRNA["Reference"].unique()[:3])
# One fetch_pubmed_id call per distinct reference string.
reference_mapping = {ref: fetch_pubmed_id(ref) for ref in ASO_mRNA["Reference"].unique()}
list(reference_mapping.items())[:3]

In [None]:
# Turn the {reference: PubMed ID} lookup into a two-column frame.
reference_df = pd.DataFrame.from_dict(reference_mapping, orient="index", columns=["PubMed ID"])
reference_df.index.name = "Reference"
reference_df = reference_df.reset_index()
# fetch_pubmed_id reports misses and failures in-band ("Null", "Error: ...");
# keep only all-digit IDs so those sentinels are not stored as PubMed IDs.
reference_df["PubMed ID"] = reference_df["PubMed ID"].where(
    reference_df["PubMed ID"].str.fullmatch(r"\d+", na=False))
ASO_mRNA = ASO_mRNA.merge(reference_df, on="Reference", how="left").drop(columns=["Reference"])

# Map the cell description through location_map ('0_y' is matched, '0_x' is the
# mapped value); unmapped descriptions keep their lower-cased raw text.
ASO_mRNA['cells used'] = ASO_mRNA['cells used'].str.lower()
ASO_mRNA = pd.merge(ASO_mRNA, location_map, right_on='0_y', left_on='cells used', how='left')
ASO_mRNA['0_x'] = ASO_mRNA['0_x'].fillna(ASO_mRNA['cells used'])
ASO_mRNA = ASO_mRNA.drop(columns=['0_y', 'cells used'])
ASO_mRNA = ASO_mRNA.rename(columns={'0_x':'Location'})
ASO_mRNA['Location'] = ASO_mRNA['Location'].replace('null',np.nan)

# Same pattern for the transfection reagent, mapped through method_map.
ASO_mRNA['Transfection reagent detail (Name, concentration)'] = ASO_mRNA['Transfection reagent detail (Name, concentration)'].str.lower()
ASO_mRNA = pd.merge(ASO_mRNA, method_map, right_on='0_y', left_on='Transfection reagent detail (Name, concentration)', how='left')
ASO_mRNA['0_x'] = ASO_mRNA['0_x'].fillna(ASO_mRNA['Transfection reagent detail (Name, concentration)'])
ASO_mRNA = ASO_mRNA.drop(columns=['0_y', 'Transfection reagent detail (Name, concentration)'])
ASO_mRNA = ASO_mRNA.rename(columns={'0_x':'Method'})
ASO_mRNA['Method'] = ASO_mRNA['Method'].replace('null',np.nan)

ASO_mRNA['Source'] = 'eSkip-Finder'
# 'PubMed ID' must become 'PubMedID': the aggregation of the represses-expression
# edges selects 'PubMedID', so an un-renamed column is silently dropped there.
ASO_mRNA = ASO_mRNA.rename(columns={'Gene':':END_ID', "Oligo name in literature":':START_ID',
                                    '# of oligo':'Number_of_oligos','Target exon':'Exon',
                                    'PubMed ID':'PubMedID'})
ASO_mRNA.head(n=3)

In [None]:
# Stack the eSkip-Finder and Addgene tables, then reduce every (start, end) pair
# to a single edge: properties are gathered into sets, oligo counts are averaged.
RNA_represses_expression_of_Gene = (
    pd.concat([ASO_mRNA, gRNA_gene])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'Method': set, 'PubMedID': set,
          'Number_of_oligos': np.mean, 'Exon': set})
    .reset_index()
    .assign(**{":TYPE": "represses_expression_of"})
)
RNA_represses_expression_of_Gene.to_pickle(
    unprocessed_edge_data_location + 'RNA_represses_expression_of_gene.pkl')
RNA_represses_expression_of_Gene.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0003002 (represses expression of) - RNA

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html) <br /> The MIT/ICBP siRNA Database has validated siRNA and shRNA sequences against over 100 genes.

In [None]:
# siRNA
# Second table of the gene index page; its first row carries the column headers.
ICBP = pd.read_html('http://web.mit.edu/sirna/sirnas-gene.html')[1]
ICBP.columns = ICBP.iloc[0]
ICBP = ICBP.drop(index=0)
# IDs become the file-name suffix of the per-entry result pages.
ICBP['ID#'] = ICBP['ID#'] + '.html'
ICBP.head(n=3)

In [None]:
# Human siRNA entries. .copy() detaches the selection from ICBP so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
ICBPsiRNA = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
# '---' marks a missing knockdown value.
ICBPsiRNA['Protein knockdown'] = ICBPsiRNA['Protein knockdown'].replace('---',np.nan)
ICBPsiRNA['mRNA knockdown'] = ICBPsiRNA['mRNA knockdown'].replace('---',np.nan)
ICBPsiRNA = ICBPsiRNA.drop(columns=['siRNA','shRNA','Mouse','Human'])
ICBPsiRNA.head(n=3)

In [None]:
def extract_term_to_search_ids(url, timeout=30):
    """Collect the search-term identifiers found in the hyperlinks of a web page.

    The page is downloaded and every ``<a href=...>`` is scanned for either a
    ``term=<id>`` query parameter (preceded by ``?`` or ``&``) or a
    ``TermToSearch=<id>`` fragment, where ``<id>`` is a run of ASCII letters
    and digits.

    Parameters
    ----------
    url : str
        Address of the page to scan.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` raises a timeout
        error (default 30). Without it a stalled server blocks indefinitely.

    Returns
    -------
    list of str
        Identifiers in document order (duplicates kept), or ``['nan']`` when no
        hyperlink matches.
    """
    # Fetch the HTML content from the URL
    response = requests.get(url, timeout=timeout)

    # Parse the HTML with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Matches '?term=<id>' / '&term=<id>' (group 1) or 'TermToSearch=<id>' (group 2)
    pattern = r'(?:[\?&]term=([a-zA-Z0-9]+))|(?:TermToSearch=([a-zA-Z0-9]+))'

    extracted_ids = []
    for link in soup.find_all('a', href=True):
        # Each match is a 2-tuple with exactly one non-empty group.
        for term_id, term_to_search_id in re.findall(pattern, link['href']):
            extracted_ids.append(term_id or term_to_search_id)

    return extracted_ids if extracted_ids else ['nan']

# Example usage: download one result page and show the identifiers found in its links.
# This performs a live HTTP request every time the cell runs.
url = 'http://web.mit.edu/sirna/sequences/results-1000.html'
ids = extract_term_to_search_ids(url)
print(ids)

In [None]:
def process_sirna_data(ICBPsiRNA):
    """
    Scrape the per-entry result page of every ID listed in ``ICBPsiRNA``.

    For each unique value of ``ICBPsiRNA['ID#']`` the page
    ``http://web.mit.edu/sirna/sequences/results-<ID#>`` is downloaded, its
    second HTML table is parsed, and one summary row is produced.

    Parameters
    ----------
    ICBPsiRNA : pandas.DataFrame
        Must contain an 'ID#' column holding the page suffixes (e.g. '1000.html').

    Returns
    -------
    pandas.DataFrame
        One row per unique 'ID#' (every row has index label 0) with columns:
        'Target Gene Info' -- the ``NM_<digits>`` accessions extracted from the
        first two parsed rows, joined by a space ('nan' where none was found);
        'Condition' -- text after 'Cell type: ' ('' if absent);
        'Cell line' -- text after 'Cell line: ' ('' if absent);
        'PMID' -- first identifier returned by ``extract_term_to_search_ids``
        ('nan' when the page has none);
        'ID#' -- the page suffix.
        An empty DataFrame is returned when there are no IDs.
    """
    # Collect the one-row frames and concatenate once at the end
    # (re-concatenating inside the loop is quadratic in the number of pages).
    per_entry_frames = []

    for sirna_id in ICBPsiRNA['ID#'].unique():
        url = 'http://web.mit.edu/sirna/sequences/results-' + sirna_id

        # Second table of the page, transposed: the table's original rows
        # become integer-labelled columns, of which 2 and 13 are used.
        df = pd.read_html(url)[1].T
        df = df.reset_index(drop=True)
        df = df.rename(columns={2:'Target Gene Info',13:'Gene Origin Info'})
        # .copy() so the assignments below do not write into a slice.
        df = df[['Target Gene Info','Gene Origin Info']].copy()

        # Pull the labelled values out of the free-text cells.
        df['Target Gene Info'] = df['Target Gene Info'].str.extract(r'Target gene: (.*)')
        df['Condition'] = df['Gene Origin Info'].str.extract(r'Cell type: (.*)')
        df['Cell line'] = df['Gene Origin Info'].str.extract(r'Cell line: (.*)')

        # Missing labels become empty strings so the string operations below work.
        df['Target Gene Info'] = df['Target Gene Info'].fillna('')
        df['Condition'] = df['Condition'].fillna('')
        df['Cell line'] = df['Cell line'].fillna('')

        # Keep only the NM_* accession written in parentheses; 'nan' marks "none found".
        df['Target Gene Info'] = df['Target Gene Info'].str.extract(r'\((NM_\d+)\)')[0]
        df['Target Gene Info'] = df['Target Gene Info'].fillna('nan')

        # Fold the first two parsed rows into a single record.
        df_combined = pd.DataFrame({
            'Target Gene Info': df.iloc[0]['Target Gene Info'] + " " + df.iloc[1]['Target Gene Info'],
            'Condition': df.iloc[0]['Condition'],  # Take from first row
            'Cell line': df.iloc[1]['Cell line']  # Take from second row
        }, index=[0])

        # Remove any label that survived the extraction.
        df_combined['Cell line'] = df_combined['Cell line'].replace(r'Cell line: ', '', regex=True)

        # The helper returns a list (['nan'] when nothing matches). Only its first
        # entry is stored: assigning a longer list to this one-row frame would
        # raise a length-mismatch ValueError.
        df_combined['PMID'] = extract_term_to_search_ids(url)[0]
        df_combined['ID#'] = sirna_id
        per_entry_frames.append(df_combined)

    if not per_entry_frames:
        # pd.concat rejects an empty list; mirror the empty-input result.
        return pd.DataFrame()

    # Return the final DataFrame containing all the processed data
    return pd.concat(per_entry_frames)

# Scrape the detail pages of the selected siRNA entries (ICBPsiRNA must already exist).
property_df = process_sirna_data(ICBPsiRNA)
# Tidy the combined accession text: strip ' nan' fragments, turn a bare 'nan'
# into a real missing value, and upper-case what remains.
property_df['Target Gene Info'] = (
    property_df['Target Gene Info']
    .str.replace(' nan', '')
    .replace('nan', np.nan)
    .str.upper()
)
property_df.head(n=3)

In [None]:
# Attach the scraped properties; fall back to the index-page gene name when no
# accession was extracted from the detail page.
df = pd.merge(ICBPsiRNA, property_df, on='ID#', how='left')
df['Target Gene Info'] = df['Target Gene Info'].fillna(df['Target Gene'])
ICBPsiRNA = df.drop(columns=['Target Gene','Protein knockdown'])
# Only entries with a reported mRNA knockdown are kept.
ICBPsiRNA = ICBPsiRNA[ICBPsiRNA['mRNA knockdown'].notna()]
# Symbols are mapped to transcript IDs; values that are not symbols in ensembl_map
# are carried over unchanged into 'transcript_stable_id'.
ICBPsiRNA = pd.merge(ICBPsiRNA, ensembl_map[['transcript_stable_id','symbol']].rename(
    columns={'symbol':'Target Gene Info'}).drop_duplicates(), on='Target Gene Info', how='left')
ICBPsiRNA.transcript_stable_id = ICBPsiRNA.transcript_stable_id.fillna(ICBPsiRNA['Target Gene Info'])
# Probe numbers become 'NCBI/probereport.cgi?uid=<n>' strings; the two
# placeholder values are reset to NaN (exact-match replace, not a regex).
ICBPsiRNA['NCBI Probe #'] = "NCBI/probereport.cgi?uid=" + ICBPsiRNA['NCBI Probe #'].astype(str)
ICBPsiRNA['NCBI Probe #'] = ICBPsiRNA['NCBI Probe #'].replace('NCBI/probereport.cgi?uid=N/A*', np.nan)
ICBPsiRNA['NCBI Probe #'] = ICBPsiRNA['NCBI Probe #'].replace('NCBI/probereport.cgi?uid=nan', np.nan)

# Join on RefSeq ID to obtain the RNAcentral ID (stored as 'transcript_ensembl').
ICBPsiRNA = pd.merge(ICBPsiRNA,
                     rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(
                                             columns={'RNAcentral ID':'transcript_ensembl',
                                                      'RefSeq ID':'transcript_stable_id'}), on='transcript_stable_id', how='left')
ICBPsiRNA = ICBPsiRNA.drop(columns=['transcript_stable_id'])

# Normalise PMIDs to plain digit strings; unparsable values become NaN.
ICBPsiRNA['PMID'] = pd.to_numeric(ICBPsiRNA['PMID'], errors='coerce')
ICBPsiRNA['PMID'] = ICBPsiRNA['PMID'].astype(str)
# Strip only a trailing float suffix. An unanchored ".0" is a regex on
# pandas < 2.0 (any character followed by 0) and would eat digits inside the ID.
ICBPsiRNA['PMID'] = ICBPsiRNA['PMID'].str.replace(r'\.0$', '', regex=True)
ICBPsiRNA['PMID'] = ICBPsiRNA['PMID'].replace("nan", np.nan)

# One row per ' / '-separated condition, mapped through disease_map
# ('0_y' is matched, '0_x' is the mapped value); unmapped text is kept as-is.
ICBPsiRNA['Condition'] = ICBPsiRNA['Condition'].str.lower().str.strip()
ICBPsiRNA['Condition'] = ICBPsiRNA['Condition'].str.split(" / ")
ICBPsiRNA = ICBPsiRNA.explode('Condition')
ICBPsiRNA = pd.merge(ICBPsiRNA, disease_map, right_on='0_y', left_on='Condition', how='left')
ICBPsiRNA['0_x'] = ICBPsiRNA['0_x'].fillna(ICBPsiRNA['Condition'])
ICBPsiRNA = ICBPsiRNA.drop(columns=['0_y', 'Condition'])
ICBPsiRNA = ICBPsiRNA.rename(columns={'0_x':'Location'})

# Same treatment for cell lines, mapped through location_map.
ICBPsiRNA['Cell line'] = ICBPsiRNA['Cell line'].str.lower().str.strip()
ICBPsiRNA['Cell line'] = ICBPsiRNA['Cell line'].str.split(" / ")
ICBPsiRNA = ICBPsiRNA.explode('Cell line')
ICBPsiRNA = pd.merge(ICBPsiRNA, location_map, right_on='0_y', left_on='Cell line', how='left')
ICBPsiRNA['0_x'] = ICBPsiRNA['0_x'].fillna(ICBPsiRNA['Cell line'])
ICBPsiRNA = ICBPsiRNA.drop(columns=['0_y', 'Cell line'])
ICBPsiRNA = ICBPsiRNA.rename(columns={'0_x':'Location2', 'NCBI Probe #':'Location3', 'PMID':'PubMedID'})

# Stack three copies of the table, each keeping one of the three location-like
# columns, then fold them into a single 'Location' column.
ICBPsiRNA = pd.concat([ICBPsiRNA.drop(columns=['Location2', 'Location3']),
    ICBPsiRNA.drop(columns=['Location', 'Location3']),
    ICBPsiRNA.drop(columns=['Location', 'Location2'])])
ICBPsiRNA['Location'] = ICBPsiRNA['Location'].fillna(ICBPsiRNA['Location2'])
ICBPsiRNA['Location'] = ICBPsiRNA['Location'].fillna(ICBPsiRNA['Location3'])

ICBPsiRNA['Source'] = 'ICBP_MIT_siRNA'
ICBPsiRNA = ICBPsiRNA.rename(columns={'ID#':':START_ID','transcript_ensembl':':END_ID','mRNA knockdown':'Knockdown_percentage'})
ICBPsiRNA.head(n=3)

In [None]:
# shRNA
# Human shRNA entries. .copy() detaches the selection from ICBP so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
ICBPshRNA = ICBP.loc[(ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
# '---' marks a missing knockdown value.
ICBPshRNA['Protein knockdown'] = ICBPshRNA['Protein knockdown'].replace('---',np.nan)
ICBPshRNA['mRNA knockdown'] = ICBPshRNA['mRNA knockdown'].replace('---',np.nan)
ICBPshRNA = ICBPshRNA.drop(columns=['siRNA','shRNA','Mouse','Human'])
ICBPshRNA.head(n=3)

In [None]:
def process_shrna_data(ICBPsiRNA):
    """
    Scrape the per-entry result page of every shRNA ID listed in the input frame.

    For each unique value of the 'ID#' column the page
    ``http://web.mit.edu/sirna/sequences/results-<ID#>`` is downloaded, its
    second HTML table is parsed, and one summary row is produced.

    Parameters
    ----------
    ICBPsiRNA : pandas.DataFrame
        Table of shRNA entries (the parameter name is kept for backward
        compatibility). Must contain an 'ID#' column holding the page suffixes.

    Returns
    -------
    pandas.DataFrame
        One row per unique 'ID#' (every row has index label 0) with columns:
        'Target Gene Info' -- the ``NM_<digits>`` accessions extracted from the
        first two parsed rows, joined by a space ('nan' where none was found);
        'Target sequence' -- text after 'Target sequence: ' in the first parsed row;
        'Condition' -- text after 'Cell type: ' ('' if absent);
        'Cell line' -- text after 'Cell line: ' ('' if absent);
        'PMID' -- first identifier returned by ``extract_term_to_search_ids``
        ('nan' when the page has none);
        'ID#' -- the page suffix.
        An empty DataFrame is returned when there are no IDs.
    """
    # Collect the one-row frames and concatenate once at the end
    # (re-concatenating inside the loop is quadratic in the number of pages).
    per_entry_frames = []

    for sirna_id in ICBPsiRNA['ID#'].unique():
        url = 'http://web.mit.edu/sirna/sequences/results-' + sirna_id

        # Second table of the page, transposed: the table's original rows
        # become integer-labelled columns, of which 2, 4 and 13 are used.
        df = pd.read_html(url)[1].T
        df = df.reset_index(drop=True)
        df = df.rename(columns={2:'Target Gene Info',4:"Target sequence",13:'Gene Origin Info'})
        # .copy() so the assignments below do not write into a slice.
        df = df[['Target Gene Info','Gene Origin Info',"Target sequence"]].copy()

        # Pull the labelled values out of the free-text cells.
        df['Target Gene Info'] = df['Target Gene Info'].str.extract(r'Target gene: (.*)')
        df['Target sequence'] = df['Target sequence'].str.extract(r'Target sequence: (.*)')
        df['Condition'] = df['Gene Origin Info'].str.extract(r'Cell type: (.*)')
        df['Cell line'] = df['Gene Origin Info'].str.extract(r'Cell line: (.*)')

        # Missing labels become empty strings so the string operations below work.
        df['Target Gene Info'] = df['Target Gene Info'].fillna('')
        df['Condition'] = df['Condition'].fillna('')
        df['Cell line'] = df['Cell line'].fillna('')

        # Keep only the NM_* accession written in parentheses; 'nan' marks "none found".
        df['Target Gene Info'] = df['Target Gene Info'].str.extract(r'\((NM_\d+)\)')[0]
        df['Target Gene Info'] = df['Target Gene Info'].fillna('nan')

        # Fold the first two parsed rows into a single record.
        df_combined = pd.DataFrame({
            'Target Gene Info': df.iloc[0]['Target Gene Info'] + " " + df.iloc[1]['Target Gene Info'],
            'Target sequence': df.iloc[0]['Target sequence'],
            'Condition': df.iloc[0]['Condition'],  # Take from first row
            'Cell line': df.iloc[1]['Cell line']  # Take from second row
        }, index=[0])

        # Remove any label that survived the extraction.
        df_combined['Cell line'] = df_combined['Cell line'].replace(r'Cell line: ', '', regex=True)

        # The helper returns a list (['nan'] when nothing matches). Only its first
        # entry is stored: assigning a longer list to this one-row frame would
        # raise a length-mismatch ValueError.
        df_combined['PMID'] = extract_term_to_search_ids(url)[0]
        df_combined['ID#'] = sirna_id
        per_entry_frames.append(df_combined)

    if not per_entry_frames:
        # pd.concat rejects an empty list; mirror the empty-input result.
        return pd.DataFrame()

    # Return the final DataFrame containing all the processed data
    return pd.concat(per_entry_frames)

# Scrape the detail pages of the selected shRNA entries (ICBPshRNA must already exist).
property_df = process_shrna_data(ICBPshRNA)
# Tidy the combined accession text: strip ' nan' fragments, turn a bare 'nan'
# into a real missing value, and upper-case what remains.
property_df['Target Gene Info'] = (
    property_df['Target Gene Info']
    .str.replace(' nan', '')
    .replace('nan', np.nan)
    .str.upper()
)
property_df.head(n=3)

In [None]:
# Attach the scraped properties; fall back to the index-page gene name when no
# accession was extracted from the detail page.
df = pd.merge(ICBPshRNA, property_df, on='ID#', how='left')
df['Target Gene Info'] = df['Target Gene Info'].fillna(df['Target Gene'])
ICBPshRNA = df.drop(columns=['Target Gene','Protein knockdown'])
# Only entries with a reported mRNA knockdown are kept.
ICBPshRNA = ICBPshRNA[ICBPshRNA['mRNA knockdown'].notna()]
# NOTE(review): this is an inner join with no fallback, whereas the siRNA cell
# uses how='left' and then fills 'transcript_stable_id' from 'Target Gene Info'.
# Rows whose 'Target Gene Info' is not a symbol in ensembl_map are dropped here —
# confirm the difference is intended.
ICBPshRNA = pd.merge(ICBPshRNA, ensembl_map[['transcript_stable_id','symbol']].rename(
    columns={'symbol':'Target Gene Info'}).drop_duplicates(), on='Target Gene Info')

# Probe numbers become 'NCBI/probereport.cgi?uid=<n>' strings; the two
# placeholder values are reset to NaN (exact-match replace, not a regex).
ICBPshRNA['NCBI Probe #'] = "NCBI/probereport.cgi?uid=" + ICBPshRNA['NCBI Probe #'].astype(str)
ICBPshRNA['NCBI Probe #'] = ICBPshRNA['NCBI Probe #'].replace('NCBI/probereport.cgi?uid=N/A*', np.nan)
ICBPshRNA['NCBI Probe #'] = ICBPshRNA['NCBI Probe #'].replace('NCBI/probereport.cgi?uid=nan', np.nan)

# Join on RefSeq ID to obtain the RNAcentral ID (stored as 'transcript_ensembl').
ICBPshRNA = pd.merge(ICBPshRNA,
                     rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(
                                             columns={'RNAcentral ID':'transcript_ensembl',
                                                      'RefSeq ID':'transcript_stable_id'}), on='transcript_stable_id', how='left')
ICBPshRNA = ICBPshRNA.drop(columns=['transcript_stable_id'])

# Normalise PMIDs to plain digit strings; unparsable values become NaN.
ICBPshRNA['PMID'] = pd.to_numeric(ICBPshRNA['PMID'], errors='coerce')
ICBPshRNA['PMID'] = ICBPshRNA['PMID'].astype(str)
# Strip only a trailing float suffix. An unanchored ".0" is a regex on
# pandas < 2.0 (any character followed by 0) and would eat digits inside the ID.
ICBPshRNA['PMID'] = ICBPshRNA['PMID'].str.replace(r'\.0$', '', regex=True)
ICBPshRNA['PMID'] = ICBPshRNA['PMID'].replace("nan", np.nan)

# One row per ' / '-separated condition, mapped through disease_map
# ('0_y' is matched, '0_x' is the mapped value); unmapped text is kept as-is.
ICBPshRNA['Condition'] = ICBPshRNA['Condition'].str.lower().str.strip()
ICBPshRNA['Condition'] = ICBPshRNA['Condition'].str.split(" / ")
ICBPshRNA = ICBPshRNA.explode('Condition')
ICBPshRNA = pd.merge(ICBPshRNA, disease_map, right_on='0_y', left_on='Condition', how='left')
ICBPshRNA['0_x'] = ICBPshRNA['0_x'].fillna(ICBPshRNA['Condition'])
ICBPshRNA = ICBPshRNA.drop(columns=['0_y', 'Condition'])
ICBPshRNA = ICBPshRNA.rename(columns={'0_x':'Location'})

# Same treatment for cell lines, mapped through location_map.
ICBPshRNA['Cell line'] = ICBPshRNA['Cell line'].str.lower().str.strip()
ICBPshRNA['Cell line'] = ICBPshRNA['Cell line'].str.split(" / ")
ICBPshRNA = ICBPshRNA.explode('Cell line')
ICBPshRNA = pd.merge(ICBPshRNA, location_map, right_on='0_y', left_on='Cell line', how='left')
ICBPshRNA['0_x'] = ICBPshRNA['0_x'].fillna(ICBPshRNA['Cell line'])
ICBPshRNA = ICBPshRNA.drop(columns=['0_y', 'Cell line'])
ICBPshRNA = ICBPshRNA.rename(columns={'0_x':'Location2', 'NCBI Probe #':'Location3'})

# Stack three copies of the table, each keeping one of the three location-like
# columns, then fold them into a single 'Location' column.
ICBPshRNA = pd.concat([ICBPshRNA.drop(columns=['Location2', 'Location3']),
    ICBPshRNA.drop(columns=['Location', 'Location3']),
    ICBPshRNA.drop(columns=['Location', 'Location2'])])
ICBPshRNA['Location'] = ICBPshRNA['Location'].fillna(ICBPshRNA['Location2'])
ICBPshRNA['Location'] = ICBPshRNA['Location'].fillna(ICBPshRNA['Location3'])

ICBPshRNA['Source'] = 'ICBP_MIT_siRNA'
ICBPshRNA = ICBPshRNA.rename(columns={'ID#':':START_ID','transcript_ensembl':':END_ID','mRNA knockdown':'Knockdown_percentage',
                                      'PMID':'PubMedID'})
ICBPshRNA.head(n=3)

In [None]:
# Stack the siRNA and shRNA tables and reduce every (start, end) pair to one
# edge whose properties are gathered into sets.
RNA_represses_expression_of_RNA = (
    pd.concat([ICBPsiRNA, ICBPshRNA])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'PubMedID': set, 'Knockdown_percentage': set})
    .reset_index()
    .assign(**{":TYPE": "represses_expression_of"})
)
RNA_represses_expression_of_RNA.to_pickle(
    unprocessed_edge_data_location + 'RNA_represses_expression_of_RNA.pkl')
RNA_represses_expression_of_RNA.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0011007 (decreases by repression quantity of) - OBO

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html)

In [None]:
# siRNA -- PRO
# Human siRNA entries. .copy() detaches the selection from ICBP so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
ICBPsiRNA_protein = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
# '---' marks a missing knockdown value.
ICBPsiRNA_protein['Protein knockdown'] = ICBPsiRNA_protein['Protein knockdown'].replace('---',np.nan)
ICBPsiRNA_protein['mRNA knockdown'] = ICBPsiRNA_protein['mRNA knockdown'].replace('---',np.nan)
ICBPsiRNA_protein = ICBPsiRNA_protein.drop(columns=['siRNA','shRNA','Mouse','Human'])
# Only entries with a reported protein knockdown are kept.
ICBPsiRNA_protein = ICBPsiRNA_protein[ICBPsiRNA_protein['Protein knockdown'].notna()]
ICBPsiRNA_protein.head(n=3)

In [None]:
# Scrape the detail pages and tidy the combined accession text: strip ' nan'
# fragments, turn a bare 'nan' into a missing value, upper-case the rest.
property_df = process_sirna_data(ICBPsiRNA_protein)
property_df['Target Gene Info'] = property_df['Target Gene Info'].str.replace(' nan', '')
property_df['Target Gene Info'] = property_df['Target Gene Info'].replace('nan', np.nan)
property_df['Target Gene Info'] = property_df['Target Gene Info'].str.upper()

# Attach the scraped properties; fall back to the index-page gene name when no
# accession was extracted from the detail page.
df = pd.merge(ICBPsiRNA_protein, property_df, on='ID#', how='left')
df['Target Gene Info'] = df['Target Gene Info'].fillna(df['Target Gene'])
ICBPsiRNA_protein = df.drop(columns=['Target Gene','mRNA knockdown'])
# Inner join on column 0 of symbol_to_pro; its column 1 becomes 'Protein'.
ICBPsiRNA_protein = pd.merge(ICBPsiRNA_protein, symbol_to_pro.rename(columns={0:'Target Gene Info'}),
    on='Target Gene Info').drop(columns=['Target Gene Info']).rename(columns={1:'Protein'})

# Probe numbers become 'NCBI/probereport.cgi?uid=<n>' strings; the two
# placeholder values are reset to NaN (exact-match replace, not a regex).
ICBPsiRNA_protein['NCBI Probe #'] = "NCBI/probereport.cgi?uid=" + ICBPsiRNA_protein['NCBI Probe #'].astype(str)
ICBPsiRNA_protein['NCBI Probe #'] = ICBPsiRNA_protein['NCBI Probe #'].replace('NCBI/probereport.cgi?uid=N/A*', np.nan)
ICBPsiRNA_protein['NCBI Probe #'] = ICBPsiRNA_protein['NCBI Probe #'].replace('NCBI/probereport.cgi?uid=nan', np.nan)

# Normalise PMIDs to plain digit strings; unparsable values become NaN.
ICBPsiRNA_protein['PMID'] = pd.to_numeric(ICBPsiRNA_protein['PMID'], errors='coerce')
ICBPsiRNA_protein['PMID'] = ICBPsiRNA_protein['PMID'].astype(str)
# Strip only a trailing float suffix. An unanchored ".0" is a regex on
# pandas < 2.0 (any character followed by 0) and would eat digits inside the ID.
ICBPsiRNA_protein['PMID'] = ICBPsiRNA_protein['PMID'].str.replace(r'\.0$', '', regex=True)
ICBPsiRNA_protein['PMID'] = ICBPsiRNA_protein['PMID'].replace("nan", np.nan)

# One row per ' / '-separated condition, mapped through disease_map
# ('0_y' is matched, '0_x' is the mapped value); unmapped text is kept as-is.
ICBPsiRNA_protein['Condition'] = ICBPsiRNA_protein['Condition'].str.lower().str.strip()
ICBPsiRNA_protein['Condition'] = ICBPsiRNA_protein['Condition'].str.split(" / ")
ICBPsiRNA_protein = ICBPsiRNA_protein.explode('Condition')
ICBPsiRNA_protein = pd.merge(ICBPsiRNA_protein, disease_map, right_on='0_y', left_on='Condition', how='left')
ICBPsiRNA_protein['0_x'] = ICBPsiRNA_protein['0_x'].fillna(ICBPsiRNA_protein['Condition'])
ICBPsiRNA_protein = ICBPsiRNA_protein.drop(columns=['0_y', 'Condition'])
ICBPsiRNA_protein = ICBPsiRNA_protein.rename(columns={'0_x':'Location'})

# Cell lines are mapped through location_map as a whole string.
# NOTE(review): unlike the mRNA-knockdown cells, 'Cell line' is not split on
# ' / ' and exploded here — confirm whether that is intentional.
ICBPsiRNA_protein['Cell line'] = ICBPsiRNA_protein['Cell line'].str.lower().str.strip()
ICBPsiRNA_protein = pd.merge(ICBPsiRNA_protein, location_map, right_on='0_y', left_on='Cell line', how='left')
ICBPsiRNA_protein['0_x'] = ICBPsiRNA_protein['0_x'].fillna(ICBPsiRNA_protein['Cell line'])
ICBPsiRNA_protein = ICBPsiRNA_protein.drop(columns=['0_y', 'Cell line'])
ICBPsiRNA_protein = ICBPsiRNA_protein.rename(columns={'0_x':'Location2', 'NCBI Probe #':'Location3'})

# Stack three copies of the table, each keeping one of the three location-like
# columns, then fold them into a single 'Location' column.
ICBPsiRNA_protein = pd.concat([ICBPsiRNA_protein.drop(columns=['Location2', 'Location3']),
    ICBPsiRNA_protein.drop(columns=['Location', 'Location3']),
    ICBPsiRNA_protein.drop(columns=['Location', 'Location2'])])
ICBPsiRNA_protein['Location'] = ICBPsiRNA_protein['Location'].fillna(ICBPsiRNA_protein['Location2'])
ICBPsiRNA_protein['Location'] = ICBPsiRNA_protein['Location'].fillna(ICBPsiRNA_protein['Location3'])

ICBPsiRNA_protein['Source'] = 'ICBP_MIT_siRNA'
ICBPsiRNA_protein = ICBPsiRNA_protein.rename(columns={'ID#':':START_ID','Protein':':END_ID',
                                                      'Protein knockdown':'Knockdown_percentage', 'PMID':'PubMedID'})
ICBPsiRNA_protein.head(n=3)

In [None]:
# shRNA -- PRO
# Human shRNA entries. .copy() detaches the selection from ICBP so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
ICBPshRNA_protein = ICBP.loc[(ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
# '---' marks a missing knockdown value.
ICBPshRNA_protein['Protein knockdown'] = ICBPshRNA_protein['Protein knockdown'].replace('---',np.nan)
ICBPshRNA_protein['mRNA knockdown'] = ICBPshRNA_protein['mRNA knockdown'].replace('---',np.nan)
ICBPshRNA_protein = ICBPshRNA_protein.drop(columns=['siRNA','shRNA','Mouse','Human'])
# Only entries with a reported protein knockdown are kept.
ICBPshRNA_protein = ICBPshRNA_protein[ICBPshRNA_protein['Protein knockdown'].notna()]
# Empty df for shRNA-protein
ICBPshRNA_protein.head(n=3)

In [None]:
# Append the shRNA-protein table (which still has its raw, un-renamed columns at
# this point) and reduce every (start, end) pair to one edge whose properties
# are gathered into sets.
ICBPsiRNA_protein = (
    pd.concat([ICBPsiRNA_protein, ICBPshRNA_protein])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set, 'PubMedID': set, 'Knockdown_percentage': set})
    .reset_index()
    .assign(**{":TYPE": "decreases_by_repression_quantity_of"})
)
ICBPsiRNA_protein.to_pickle(
    unprocessed_edge_data_location + 'RNA_decreases_by_repression_quantity_of_OBO.pkl')
ICBPsiRNA_protein.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002430 (involved in negative regulation of) - RNA

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709) <br /> DrugBank is a comprehensive, free-to-access, online database containing information on drugs and drug targets. As both a bioinformatics and a cheminformatics resource, it combines detailed drug (i.e. chemical, pharmacological and pharmaceutical) data with comprehensive drug target (i.e. sequence, structure, and pathway) information.

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_mRNA_prot_drugbank = pd.read_csv(processed_data_location + 'DrugBank/ASO-gene_DrugBank.txt', sep='\t') 
ASO_mRNA_prot_drugbank['Drug'] = ASO_mRNA_prot_drugbank['Drug'].str.lower()
# Default (inner) merge: drugs without an entry in desc_chebi_map are dropped here
ASO_mRNA_prot_drugbank = pd.merge(ASO_mRNA_prot_drugbank, desc_chebi_map.rename(columns={0:'Drug'}), on='Drug').rename(columns={1:'Chemical'})
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.lower()
# Every target rewrite in this cell is a literal substitution. regex=False is passed
# explicitly because several patterns contain parentheses: interpreted as a regular
# expression (the default before pandas 2.0) "(sod1)" is a capture group and the
# literal text "superoxide dismutase 1 (sod1)" would never match.
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace("superoxide dismutase 1 (sod1)","hsod1",regex=False)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace("5-aminolevulinate synthase, non-specific, mitochondrial","halas1",regex=False)
# Left merge: rows whose target has no entry in desc_pro_map keep a NaN 'Protein'
ASO_mRNA_prot_drugbank = pd.merge(ASO_mRNA_prot_drugbank, desc_pro_map.rename(columns={0:'Target'}), on='Target',how='left').rename(
    columns={1:'Protein'})
# Exon number carried by the three exon-specific DMD target descriptions (NaN for every other target)
exon_mapping = {
    'dmd gene (exon 53 viltolarsen target site)': '53',
    'dmd gene (exon 45 casimersen target site)': '45',
    'dmd-001 gene (exon 51 target site)': '51'
}
# The exon must be extracted before the descriptions are collapsed to "DMD" below
ASO_mRNA_prot_drugbank['Exon'] = ASO_mRNA_prot_drugbank['Target'].map(exon_mapping)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace("dmd gene (exon 53 viltolarsen target site)","DMD",regex=False)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace("dmd gene (exon 45 casimersen target site)","DMD",regex=False)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace("dmd-001 gene (exon 51 target site)","DMD",regex=False)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace("mrna of apob-100","APOB",regex=False)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace('transthyretin mrna',"TTR",regex=False)
ASO_mRNA_prot_drugbank['Target'] = ASO_mRNA_prot_drugbank['Target'].str.replace('alas1 mrna',"ALAS1",regex=False)

# Symbol -> transcript lookup restricted to protein-coding transcripts
aso_ensembl_map = ensembl_map[['transcript_stable_id','symbol','ensembl_transcript_type']].rename(columns={'symbol':'Target'})
aso_ensembl_map = aso_ensembl_map[aso_ensembl_map['ensembl_transcript_type'] == 'protein_coding']
aso_ensembl_map = aso_ensembl_map.drop(columns=['ensembl_transcript_type']).drop_duplicates()

# Attach the transcript IDs as 'RNA'; targets that are not gene symbols stay NaN (left merge)
ASO_mRNA_prot_drugbank = pd.merge(ASO_mRNA_prot_drugbank, aso_ensembl_map, on='Target',how='left').rename(
    columns={'transcript_stable_id':'RNA'}).drop(columns=['Target']).drop_duplicates()
ASO_mRNA_prot_drugbank['Source'] = 'DrugBank'
ASO_mRNA_prot_drugbank.head(n=3)

In [None]:
# Chemical -> RNA edges: only rows where a transcript was mapped; the protein column is not needed
ASO_mRNA_drugbank = ASO_mRNA_prot_drugbank.loc[ASO_mRNA_prot_drugbank['RNA'].notna()].drop(columns=['Protein'])
print(ASO_mRNA_drugbank['Type'].unique())
ASO_mRNA_drugbank = (
    ASO_mRNA_drugbank
    .drop(columns=['Type'])
    .drop_duplicates()
    .rename(columns={'RNA': ':END_ID', 'Chemical': ':START_ID'})
)
ASO_mRNA_drugbank.head(n=3)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_mRNA_prot_drugbank = pd.read_csv(processed_data_location + 'DrugBank/siRNA-gene_DrugBank.txt', sep='\t')
siRNA_mRNA_prot_drugbank['Drug'] = siRNA_mRNA_prot_drugbank['Drug'].str.lower()
siRNA_mRNA_prot_drugbank = siRNA_mRNA_prot_drugbank.merge(
    desc_chebi_map.rename(columns={0:'Drug'}), on='Drug', how='left'
).rename(columns={1:'Chemical'})

# Normalise target names; the comma-containing ALAS1 description is rewritten before
# the split on "," so that it is not broken into pieces.
siRNA_mRNA_prot_drugbank['Target'] = (
    siRNA_mRNA_prot_drugbank['Target']
    .str.lower()
    .str.replace("5-aminolevulinate synthase, non-specific, mitochondrial", "halas1")
    .str.replace("alpha1-acid glycoprotein", "alpha-1-acid glycoprotein 2,alpha-1-acid glycoprotein 1")
    .str.split(",")
)
# One row per individual target, then attach the 'Protein' mapping
siRNA_mRNA_prot_drugbank = (
    siRNA_mRNA_prot_drugbank
    .explode('Target')
    .merge(desc_pro_map.rename(columns={0:'Target'}), on='Target', how='left')
    .rename(columns={1:'Protein'})
)

siRNA_mRNA_prot_drugbank['Target'] = (
    siRNA_mRNA_prot_drugbank['Target']
    .str.replace('transthyretin mrna', "TTR")
    .str.replace('alas1 mrna', "ALAS1")
)

# Symbol -> transcript lookup restricted to protein-coding transcripts
aso_ensembl_map = ensembl_map[['transcript_stable_id','symbol','ensembl_transcript_type']].rename(columns={'symbol':'Target'})
aso_ensembl_map = (
    aso_ensembl_map.loc[aso_ensembl_map['ensembl_transcript_type'] == 'protein_coding']
    .drop(columns=['ensembl_transcript_type'])
    .drop_duplicates()
)

siRNA_mRNA_prot_drugbank = (
    siRNA_mRNA_prot_drugbank
    .merge(aso_ensembl_map, on='Target', how='left')
    .rename(columns={'transcript_stable_id':'RNA'})
    .drop(columns=['Target'])
    .drop_duplicates()
)

siRNA_mRNA_prot_drugbank['Source'] = 'DrugBank'
siRNA_mRNA_prot_drugbank.head(n=3)

In [None]:
# Chemical -> RNA edges: only rows where a transcript was mapped; the protein column is not needed
siRNA_mRNA_drugbank = siRNA_mRNA_prot_drugbank.loc[siRNA_mRNA_prot_drugbank['RNA'].notna()].drop(columns=['Protein'])
print(siRNA_mRNA_drugbank['Type'].unique())
siRNA_mRNA_drugbank = (
    siRNA_mRNA_drugbank
    .drop(columns=['Type'])
    .drop_duplicates()
    .rename(columns={'RNA': ':END_ID', 'Chemical': ':START_ID'})
)
siRNA_mRNA_drugbank.head(n=3)

In [None]:
# Merge the ASO and siRNA edges, keep one row per (start, end) pair with the sets of
# sources and exons observed for it, and label the relation.
OBO_involved_in_negative_regulation_of_RNA = (
    pd.concat([ASO_mRNA_drugbank, siRNA_mRNA_drugbank])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Exon': set})
    .reset_index()
    .assign(**{":TYPE": "involved_in_negative_regulation_of"})
)
OBO_involved_in_negative_regulation_of_RNA.to_pickle(unprocessed_edge_data_location+'OBO_involved_in_negative_regulation_of_RNA.pkl')
OBO_involved_in_negative_regulation_of_RNA.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0011007 (decreases by repression quantity of) - OBO

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# DrugBank -- PRO: rows for which a protein was mapped
ASO_pro_drugbank = ASO_mRNA_prot_drugbank.loc[ASO_mRNA_prot_drugbank['Protein'].notna()].drop(columns=['RNA', "Exon"])
print(ASO_pro_drugbank['Type'].unique())
# 'target' and 'enzyme' rows become repression edges; 'carrier' rows are set aside for the carrier relation
ASO_pro_drugbank_down = (
    ASO_pro_drugbank.loc[ASO_pro_drugbank['Type'].isin(['target', 'enzyme'])]
    .drop(columns=['Type'])
    .drop_duplicates()
    .rename(columns={'Protein': ':END_ID', 'Chemical': ':START_ID'})
)
ASO_pro_drugbank_carrier = (
    ASO_pro_drugbank.loc[ASO_pro_drugbank['Type'] == 'carrier']
    .drop(columns=['Type'])
    .drop_duplicates()
)
ASO_pro_drugbank_down.head(n=3)

In [None]:
# siRNA rows for which a protein was mapped
siRNA_pro_drugbank = siRNA_mRNA_prot_drugbank.loc[siRNA_mRNA_prot_drugbank['Protein'].notna()].drop(columns=['RNA'])
print(siRNA_pro_drugbank['Type'].unique())
# 'target' and 'enzyme' rows become repression edges; 'carrier' rows are set aside for the carrier relation
siRNA_pro_drugbank_down = (
    siRNA_pro_drugbank.loc[siRNA_pro_drugbank['Type'].isin(['target', 'enzyme'])]
    .drop(columns=['Type'])
    .drop_duplicates()
    .rename(columns={'Protein': ':END_ID', 'Chemical': ':START_ID'})
)
siRNA_pro_drugbank_carrier = (
    siRNA_pro_drugbank.loc[siRNA_pro_drugbank['Type'] == 'carrier']
    .drop(columns=['Type'])
    .drop_duplicates()
)
siRNA_pro_drugbank_down.head(n=3)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001641
aptamer_protein = pd.read_csv(processed_data_location + 'DrugBank/aptamer-gene_DrugBank.txt', sep='\t')
print(aptamer_protein['Type'].unique())
aptamer_protein = aptamer_protein.drop(columns=['Type'])
# Lower-cased drug names are joined to desc_chebi_map to obtain the 'Chemical' column
aptamer_protein['Drug'] = aptamer_protein['Drug'].str.lower()
aptamer_protein = aptamer_protein.merge(
    desc_chebi_map.rename(columns={0:'Drug'}), on='Drug', how='left'
).rename(columns={1:'Chemical'})
# Lower-cased target names are joined to desc_pro_map to obtain the 'Protein' column
aptamer_protein['Target'] = aptamer_protein['Target'].str.lower()
aptamer_protein = aptamer_protein.merge(
    desc_pro_map.rename(columns={0:'Target'}), on='Target', how='left'
).rename(columns={1:'Protein'})
aptamer_protein = (
    aptamer_protein
    .assign(Source='DrugBank')
    .rename(columns={'Protein': ':END_ID', 'Chemical': ':START_ID'})
    .drop_duplicates()
)
aptamer_protein.head(n=3)

In [None]:
# Pool the ASO, siRNA and aptamer edges; one row per (start, end) pair with the set of sources
OBO_decreases_by_repression_quantity_of_OBO = (
    pd.concat([ASO_pro_drugbank_down, siRNA_pro_drugbank_down, aptamer_protein])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "decreases_by_repression_quantity_of"})
)
OBO_decreases_by_repression_quantity_of_OBO.to_pickle(unprocessed_edge_data_location+'OBO_decreases_by_repression_quantity_of_OBO.pkl')
OBO_decreases_by_repression_quantity_of_OBO.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0010002 (is carrier of) - OBO

* DrugBank

In [None]:
# Turn the ASO 'carrier' rows into edge form: Chemical is the start node, Protein the end node
ASO_pro_drugbank_carrier = ASO_pro_drugbank_carrier.rename(columns={'Protein':':END_ID', 'Chemical':':START_ID'})
ASO_pro_drugbank_carrier

In [None]:
# Turn the siRNA 'carrier' rows into edge form: Chemical is the start node, Protein the end node
siRNA_pro_drugbank_carrier = siRNA_pro_drugbank_carrier.rename(columns={'Protein':':END_ID', 'Chemical':':START_ID'})
siRNA_pro_drugbank_carrier.head(n=3)

In [None]:
# Forward relation: one row per (start, end) pair with the set of sources
OBO_is_carrier_of_OBO = (
    pd.concat([ASO_pro_drugbank_carrier, siRNA_pro_drugbank_carrier])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "is_carrier_of"})
)
OBO_is_carrier_of_OBO.to_pickle(unprocessed_edge_data_location+'OBO_is_carrier_of_OBO.pkl')

# Reverse relation: same rows with start and end swapped and the type replaced
OBO_generically_depends_on_OBO = (
    OBO_is_carrier_of_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{":TYPE": "generically_depends_on"})
)
OBO_generically_depends_on_OBO.to_pickle(unprocessed_edge_data_location+'OBO_generically_depends_on_OBO.pkl')
OBO_generically_depends_on_OBO.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002606 (is substance that treats) - OBO

* CTD

In [None]:
# Fetch the CTD chemical-disease report into unprocessed_data_location
# (the next cell reads it as 'CTD_chemicals_diseases.tsv', so the helper presumably decompresses the .gz — TODO confirm)
data_downloader("https://ctdbase.org/reports/CTD_chemicals_diseases.tsv.gz", unprocessed_data_location)

In [None]:
# Load the CTD chemical-disease report ('#' lines are comments, column names are supplied here)
ctd_disease = pd.read_csv(unprocessed_data_location+'CTD_chemicals_diseases.tsv', sep='\t', comment="#",
                          names=['ChemicalName','ChemicalID','CasRN','DiseaseName','DiseaseID','DirectEvidence','InferenceGeneSymbol',
                                 'InferenceScore','OmimIDs','PubMedIDs'])
# Keep only the rows that carry a DirectEvidence value
ctd_disease = ctd_disease[ctd_disease['DirectEvidence'].notna()]
print(ctd_disease['InferenceGeneSymbol'].unique())
print(ctd_disease['InferenceScore'].unique())
# .copy() so that the column assignments below act on an independent frame
ctd_disease = ctd_disease[['ChemicalID','DiseaseID','PubMedIDs']].copy()
ctd_disease['ChemicalID'] = "MESH_" + ctd_disease['ChemicalID']
# Drop the prefix before ':' from the disease identifier
ctd_disease['DiseaseID'] = ctd_disease['DiseaseID'].str.split(':').str[-1]

# NOTE(review): errors='coerce' turns every non-numeric value into NaN, so a cell holding
# several IDs joined by a separator would be discarded — confirm this is intended.
ctd_disease['PubMedIDs'] = pd.to_numeric(ctd_disease['PubMedIDs'], errors='coerce')
ctd_disease['PubMedIDs'] = ctd_disease['PubMedIDs'].astype(str)
# Strip only the trailing ".0" left by the float conversion. The pattern is anchored and
# escaped: as an unescaped regex (the default before pandas 2.0) ".0" matches ANY character
# followed by 0 and would delete digits from inside the identifier.
ctd_disease['PubMedIDs'] = ctd_disease['PubMedIDs'].str.replace(r"\.0$", "", regex=True)
ctd_disease['PubMedIDs'] = ctd_disease['PubMedIDs'].replace("nan", np.nan)
ctd_disease['Source'] = 'CTD'

ctd_disease.head(n=3)

In [None]:
# Two inner joins in sequence: disease IDs first, chemical IDs second. Both lookup tables
# contribute a column labelled 1, so after the second join they appear as '1_x' (disease
# mapping) and '1_y' (chemical mapping) and are renamed to the edge endpoints.
ctd_disease = (
    ctd_disease
    .merge(disgenet_mondo_hpo_map.rename(columns={0:'DiseaseID'}), on='DiseaseID')
    .merge(mesh_to_chebi.rename(columns={0:'ChemicalID'}), on='ChemicalID')
    .rename(columns={'1_y': ':START_ID', '1_x': ':END_ID', 'PubMedIDs': 'PubMedID'})
)
ctd_disease.head(n=3)

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
# DrugBank -- Mondo
ASO_disease = pd.read_csv(processed_data_location + 'DrugBank/ASO-disease_DrugBank.txt', sep='\t')
ASO_disease['Drug'] = ASO_disease['Drug'].str.lower()
ASO_disease = ASO_disease.merge(
    desc_chebi_map.rename(columns={0:'Drug'}), on='Drug', how='left'
).rename(columns={1:'Chemical'})
# Manual override of the chemical identifier for the row labelled 10
ASO_disease.loc[10, "Chemical"] = 'DB05572'
ASO_disease.head(n=3)

In [None]:
# Manually curated disease annotation, one entry per row of ASO_disease in row order.
# An entry may list several comma-separated identifiers; "Not Annotated" rows are removed below.
ASO_disease['Disease'] = [
    "MONDO_0018328",
    "MONDO_0001657,MONDO_0007254",
    "MONDO_0000878",
    "Not Annotated",
    "MONDO_0010679",
    "MONDO_0001516",
    "MONDO_0007100",
    "MONDO_0002520",
    "MONDO_0010679",
    "MONDO_0010679",
    "MONDO_0004979",
    "MONDO_0010679",
    "MONDO_0018634",
    "MONDO_0005144",
    "Not Annotated",
    "MONDO_0007100",
]
ASO_disease['Disease'] = ASO_disease['Disease'].str.split(',')
# One row per individual disease identifier
ASO_disease = (
    ASO_disease
    .explode('Disease')
    .drop(columns=['Drug Description'])
    .assign(Source='DrugBank')
)
ASO_disease = (
    ASO_disease.loc[ASO_disease['Disease'] != 'Not Annotated']
    .rename(columns={'Disease': ':END_ID', 'Chemical': ':START_ID'})
)
ASO_disease.head(n=3)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_disease = pd.read_csv(processed_data_location + 'DrugBank/siRNA-disease_DrugBank.txt', sep='\t')
siRNA_disease['Drug'] = siRNA_disease['Drug'].str.lower()
# Left join keeps drugs without an entry in desc_chebi_map (their 'Chemical' is NaN)
siRNA_disease = siRNA_disease.merge(
    desc_chebi_map.rename(columns={0:'Drug'}), on='Drug', how='left'
).rename(columns={1:'Chemical'})
siRNA_disease.head(n=3)

In [None]:
# Manually curated disease annotation, one entry per row of siRNA_disease in row order;
# an entry may list several comma-separated identifiers.
siRNA_disease['Disease'] = [
    'MONDO_0017132,MONDO_0001824',
    'MONDO_0002520',
    'MONDO_0009823',
    'MONDO_0017132,MONDO_0001824',
]

siRNA_disease['Disease'] = siRNA_disease['Disease'].str.split(',')
# One row per individual disease identifier, then shape the frame as edges
siRNA_disease = (
    siRNA_disease
    .explode('Disease')
    .drop(columns=['Drug Description'])
    .assign(Source='DrugBank')
    .rename(columns={'Disease': ':END_ID', 'Chemical': ':START_ID'})
)
siRNA_disease.head(n=3)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001641
aptamer_disease = pd.read_csv(processed_data_location + 'DrugBank/aptamer-disease_DrugBank.txt', sep='\t')
aptamer_disease['Drug'] = aptamer_disease['Drug'].str.lower()
# Left join keeps drugs without an entry in desc_chebi_map (their 'Chemical' is NaN)
aptamer_disease = (
    aptamer_disease
    .merge(desc_chebi_map.rename(columns={0:'Drug'}), on='Drug', how='left')
    .rename(columns={1:'Chemical'})
    .assign(Source='DrugBank')
)
aptamer_disease

In [None]:
# Manually curated disease annotation, one entry per row of aptamer_disease in row order;
# an entry may list several comma-separated identifiers.
aptamer_disease['Disease'] = [
    'MONDO_0019514',
    'MONDO_0004992,MONDO_0002367,MONDO_0004643,MONDO_0009831',
    'MONDO_0005150',
]
aptamer_disease['Disease'] = aptamer_disease['Disease'].str.split(',')
# One row per individual disease identifier, then shape the frame as edges
aptamer_disease = (
    aptamer_disease
    .explode('Disease')
    .drop(columns=['Drug Description'])
    .rename(columns={'Disease': ':END_ID', 'Chemical': ':START_ID'})
)
aptamer_disease.head(n=3)

In [None]:
# Forward relation: pool CTD and the three DrugBank tables, one row per (start, end) pair
OBO_is_substance_that_treats_OBO = (
    pd.concat([ctd_disease, ASO_disease, siRNA_disease, aptamer_disease])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{":TYPE": "is_substance_that_treats"})
)
OBO_is_substance_that_treats_OBO.to_pickle(unprocessed_edge_data_location+'OBO_is_substance_that_treats_OBO.pkl')

# Reverse relation: same rows with start and end swapped and the type replaced
OBO_is_treated_by_substance_OBO = (
    OBO_is_substance_that_treats_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{":TYPE": "is_treated_by_substance"})
)
OBO_is_treated_by_substance_OBO.to_pickle(unprocessed_edge_data_location+'OBO_is_treated_by_substance_OBO.pkl')
OBO_is_treated_by_substance_OBO.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002599 (capable of inhibiting or preventing pathological process) - OBO

* DrugBank

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005631
mRNAv_disease = pd.read_csv(processed_data_location + 'DrugBank/mRNAv-disease_DrugBank.txt', sep='\t')
mRNAv_disease['Drug'] = mRNAv_disease['Drug'].str.lower()
# Every row is linked to the same disease identifier
mRNAv_disease = (
    mRNAv_disease
    .merge(desc_chebi_map.rename(columns={0:'Drug'}), on='Drug', how='left')
    .rename(columns={1:'Chemical'})
    .assign(Disease='MONDO_0100096')
    .drop(columns=['Drug Description'])
    .assign(Source='DrugBank')
    .rename(columns={'Disease': ':END_ID', 'Chemical': ':START_ID'})
)
mRNAv_disease.head(n=3)

In [None]:
# One row per (start, end) pair with the set of sources, labelled with the relation type
OBO_capable_of_inhibiting_or_preventing_pathological_process_OBO = (
    mRNAv_disease
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "capable_of_inhibiting_or_preventing_pathological_process"})
)
OBO_capable_of_inhibiting_or_preventing_pathological_process_OBO.to_pickle(
    unprocessed_edge_data_location+'OBO_capable_of_inhibiting_or_preventing_pathological_process_OBO.pkl')
OBO_capable_of_inhibiting_or_preventing_pathological_process_OBO.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002314 (characteristic of part of) - OBO

* [RSwitch database](https://penchovsky.atwebpages.com/applications.php?page=58) <br /> 
The RSwitch database contains information on using riboswitches as antibacterial drug targets. Each riboswitch is represented by its ID, name, aptamer sequences, secondary structures, multiple alignments, consensus motifs, and biochemical pathways.

In [None]:
# Manual collection of data from https://penchovsky.atwebpages.com/applications.php?page=58
# The file has no header row, so columns are labelled 0, 1, 2, ...
# (column 2 is later treated as the organism name and mapped to NCBITaxon)
riboswitch_bactStrain = pd.read_csv(unprocessed_data_location + 'rswitch.csv', header=None) # NCBITaxon
riboswitch_bactStrain.head(n=3)

In [None]:
# Write the distinct values of column 2, one per line and without header or index,
# so the list can be submitted to the NCBI taxonomy identifier service below.
riboswitch_bactStrain[2].drop_duplicates().to_csv(
    unprocessed_data_location + 'bacteria.txt', header=None, sep='\n', index=None)
# --> https://www.ncbi.nlm.nih.gov/Taxonomy/TaxIdentifier/tax_identifier.cgi (Note that some manual work is needed)

In [None]:
# Fields in the taxonomy report are separated by "<tab>|<tab>". The separator is a regular
# expression (hence engine='python'); it is written as a raw string so that "\|" is not an
# invalid escape sequence in a normal string literal.
bacteria = pd.read_csv(unprocessed_data_location + 'tax_report.txt', sep=r'\t\|\t', engine='python')
# Nullable integer dtype keeps missing taxids as <NA> instead of forcing floats
bacteria['taxid'] = bacteria['taxid'].astype('Int64')
bacteria.head(n=3)

In [None]:
# Attach the taxid of each organism name (column 2 of the RSwitch table)
riboswitch_bactStrain = riboswitch_bactStrain.rename(columns={2:'name'}).merge(
    bacteria[['name','taxid']], on=['name'])
riboswitch_bactStrain['taxid'] = 'NCBITaxon_' + riboswitch_bactStrain['taxid'].astype(str)
# Missing taxids render as '<NA>' after the string conversion; drop those rows
riboswitch_bactStrain = (
    riboswitch_bactStrain.loc[riboswitch_bactStrain['taxid'] != 'NCBITaxon_<NA>']
    .drop(columns=[1, 'name'])
    .drop_duplicates()
    .assign(Source='RSwitch')
    .rename(columns={0: ':START_ID', 'taxid': ':END_ID'})
)
riboswitch_bactStrain.head(n=3)

* TBDB

In [None]:
# riboswitch -- GO
riboswitch_gobp = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',')
# Text between the last '[' and the following ']' of protein_desc, with ':' turned into '_'
gobp = (
    riboswitch_gobp['protein_desc']
    .str.rpartition('[')[2]
    .str.rpartition(']')[0]
    .str.replace(":", "_")
)
riboswitch_gobp = pd.concat([riboswitch_gobp, gobp.rename('gobp')], axis=1)
# Only rows whose bracketed annotation contains "GO" are kept; the edge itself links
# the accession to its taxon.
riboswitch_gobp = (
    riboswitch_gobp.loc[riboswitch_gobp['gobp'].str.contains("GO", na=False), ['accession_url', 'TaxId']]
    .drop_duplicates()
)
riboswitch_gobp['TaxId'] = 'NCBITaxon_' + riboswitch_gobp['TaxId'].astype(str)
riboswitch_gobp = (
    riboswitch_gobp
    .assign(Source='TBDB')
    .rename(columns={'accession_url': ':START_ID', 'TaxId': ':END_ID'})
)
riboswitch_gobp.head(n=3)

* Ribocentre

In [None]:
# Two hand-picked accessions, both linked to NCBITaxon_12475 and attributed to Ribocentre
ribocentre = pd.DataFrame({
    ':START_ID': ['URS00006C745E', 'URS00006C1D09'],
    ':END_ID': 'NCBITaxon_12475',
    'Source': 'Ribocentre',
})
ribocentre.head(n=3)

In [None]:
# Pool the RSwitch, TBDB and Ribocentre edges; one row per (start, end) pair with the set of sources
riboswitch_bactStrain = (
    pd.concat([riboswitch_bactStrain, riboswitch_gobp, ribocentre])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'characteristic_of_part_of'})
)
riboswitch_bactStrain.to_pickle(unprocessed_edge_data_location+'RNA_characteristic_of_part_of_OBO.pkl')
riboswitch_bactStrain.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002387 (has potential to develop into) - OBO


* [RSwitch database](https://penchovsky.atwebpages.com/applications.php?page=58) <br /> 

In [None]:
# Re-point every start node of the previous edge table to a single VO term
# (VO -- immunization target role of bacterial pathogen) and overwrite the source label.
riboswitch = (
    riboswitch_bactStrain
    .drop(columns=[':END_ID'])
    .assign(**{':END_ID': 'VO_0001281', 'Source': 'RSwitch'})
)
riboswitch.head(n=3)

In [None]:
# One row per (start, end) pair with the set of sources, labelled with the relation type
riboswitch = (
    riboswitch
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "has_potential_to_develop_into"})
)
riboswitch.to_pickle(unprocessed_edge_data_location+'RNA_has_potential_to_develop_into_OBO.pkl')
riboswitch.head(n=3)

***
### Genome - http://purl.obolibrary.org/obo/RO_0002526 (overlaps sequence of) - RNA

* [ViroidDB](https://viroids.org/) <br />
ViroidDB is the most comprehensive collection of viroid, satellite RNA, retrozyme, and deltavirus genome sequences available on the internet. 

In [None]:
# Fetch the full ViroidDB dump; the next cell reads it from unprocessed_data_location as 'all.json'
data_downloader('https://viroids.org/db/latest/all.json', unprocessed_data_location)

In [None]:
# Transposed so that each top-level JSON entry becomes a row
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T

# Extract ribozymes: every line starting with ">> " inside the 'ribozymes' text is one hit;
# the surrounding newlines and the ">> " marker are stripped.
myre = re.compile(r"\n>> .*?\n")
ribozyme = [
    [hit.replace("\n", '').replace(">> ", '') for hit in myre.findall(entry)]
    for entry in vRNA_ribozyme.ribozymes
]

# List of all possible ribozymes (useful for mapping)
a = list(itertools.chain.from_iterable(ribozyme))
set(a)

In [None]:
# Put the per-entry ribozyme hit lists next to the ViroidDB records. The index is reset
# first so that both objects align on 0..n-1; the unnamed Series becomes column 0.
vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']), # Genome --> NCBI nuccore 
                           pd.Series(ribozyme)], axis=1)
# One row per ribozyme hit; only the first whitespace-separated token of each hit is kept
vRNA_ribozyme = vRNA_ribozyme.explode(0)
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
# Discard every metadata column that is not used by the following cells
vRNA_ribozyme.drop(columns=['isolationSource','collectionDate','gc','bioSample','identicalSeqs','genBankTitle','displayTitle',
                            'length','sequenceType','nucCompleteness','genotype','segment','moleculeType','publications',
                           'geoLocation','country','usa','submitters','releaseDate','isolate',
                            'sequence','structure','type','Cls_ID80','genus','family','ribozymes',
                            'Cls_ID70','Cls_ID85','Cls_ID75','Cls_ID95','Cls_ID90','sraAccession','submitters','host'],
                   inplace=True)
# Move the ribozyme column to the front and relabel it from 0 to 1
# (the merge in a later cell joins on this column with right_on=1)
vRNA_ribozyme.insert(0,1,vRNA_ribozyme.pop(0))
# Strip the version suffix after '.' from the accession
vRNA_ribozyme['accession'] = vRNA_ribozyme['accession'].str.split(".").str[0]
vRNA_ribozyme.head(n=3)

In [None]:
print(vRNA_ribozyme['species'].unique()[:3])
# Among them, only Hepatitis delta virus (NCBI taxid: 12475) is a human pathogen
vRNA_ribozyme = vRNA_ribozyme.loc[vRNA_ribozyme['species'] == 'Hepatitis delta virus']
# Restrict the RNAcentral/Rfam mapping to the same organism
rnacentral_map_rfam_delta = rnacentral_map_rfam.loc[rnacentral_map_rfam['Organism'] == 12475]

In [None]:
# Ribozyme name -> Rfam ID. Both frames have a column labelled 1, so the join produces
# '1_x' (from ribozyme_rfam_map, kept as 'Rfam ID') and '1_y' (the join key copy, dropped).
vRNA_ribozyme = (
    ribozyme_rfam_map
    .merge(vRNA_ribozyme, left_on=0, right_on=1)
    .drop(columns=['1_y'])
    .rename(columns={'1_x': 'Rfam ID'})
)
# Rfam ID -> RNAcentral ID for the organism selected in the previous cell
vRNA_ribozyme = vRNA_ribozyme.merge(
    rnacentral_map_rfam_delta[['RNAcentral ID', 'Rfam ID']].drop_duplicates(),
    on='Rfam ID',
).drop(columns=['Rfam ID', 0, 'species'])

vRNA_ribozyme = (
    vRNA_ribozyme
    .rename(columns={'RNAcentral ID': ':START_ID', 'accession': ':END_ID'})
    .assign(Source='ViroidDB')
)
vRNA_ribozyme.head(n=3)

In [None]:
# RNA -> genome direction: one row per (start, end) pair with the set of sources
vRNA_ribozyme = (
    vRNA_ribozyme
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "overlaps_sequence_of"})
)
vRNA_ribozyme.to_pickle(unprocessed_edge_data_location+'RNA_overlaps_sequence_of_genome.pkl')

# Genome -> RNA direction: same rows with the endpoints swapped. vRNA_ribozyme keeps the
# swapped orientation afterwards.
vRNA_ribozyme = vRNA_ribozyme.rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
vRNA_ribozyme.to_pickle(unprocessed_edge_data_location+'genome_overlaps_sequence_of_RNA.pkl')
vRNA_ribozyme.head(n=3)

***
### Genome - http://purl.obolibrary.org/obo/RO_0000052 (characteristic of) - OBO

* [ViroidDB](https://viroids.org/)

In [None]:
# Every distinct start node of vRNA_ribozyme is linked to NCBITaxon_12475
genome_characteristic_of_OBO = pd.DataFrame({':START_ID': vRNA_ribozyme[":START_ID"].unique()})
genome_characteristic_of_OBO[':END_ID'] = ["NCBITaxon_12475"] * len(genome_characteristic_of_OBO)
genome_characteristic_of_OBO = (
    genome_characteristic_of_OBO
    .assign(Source='ViroidDB')
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'characteristic_of'})
)
genome_characteristic_of_OBO.to_pickle(unprocessed_edge_data_location+'genome_characteristic_of_OBO.pkl')

# Reverse relation: same rows with start and end swapped and the type replaced
OBO_has_characteristic_genome = (
    genome_characteristic_of_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{":TYPE": "has_characteristic"})
)
OBO_has_characteristic_genome.to_pickle(unprocessed_edge_data_location+'OBO_has_characteristic_genome.pkl')
OBO_has_characteristic_genome.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002556 (pathogen of) - OBO

* RSwitch
* TBDB
* Ribocentre

In [None]:
# Unpack the source sets to one row per source, then reuse the taxon (previous end node)
# as the start node of the pathogen edges.
riboswitch_bactStrain = (
    riboswitch_bactStrain
    .explode('Source')[[':END_ID', 'Source']]
    .rename(columns={':END_ID': ':START_ID'})
)
# Homo Sapiens
riboswitch_bactStrain[':END_ID'] = ['NCBITaxon_9606'] * len(riboswitch_bactStrain)
riboswitch_bactStrain.head(n=3)

* [ViroidDB](https://viroids.org/)

In [None]:
# Single hand-written edge: NCBITaxon_12475 -> NCBITaxon_9606 (Homo Sapiens), attributed to ViroidDB
OBO_pathogen_of_OBO = pd.DataFrame({
    ':START_ID': ['NCBITaxon_12475'],
    ':END_ID': ['NCBITaxon_9606'],
    'Source': ['ViroidDB'],
})
OBO_pathogen_of_OBO

In [None]:
# Forward relation: add the riboswitch-derived taxa, one row per (start, end) pair
OBO_pathogen_of_OBO = (
    pd.concat([OBO_pathogen_of_OBO, riboswitch_bactStrain])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{":TYPE": "pathogen_of"})
)
OBO_pathogen_of_OBO.to_pickle(unprocessed_edge_data_location+'OBO_pathogen_of_OBO.pkl')

# Reverse relation: same rows with start and end swapped and the type replaced
OBO_has_pathogen_OBO = (
    OBO_pathogen_of_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{":TYPE": "has_pathogen"})
)
OBO_has_pathogen_OBO.to_pickle(unprocessed_edge_data_location+'OBO_has_pathogen_OBO.pkl')
OBO_has_pathogen_OBO.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0001025 (located in) - OBO

* Human Protein Atlas

In [None]:
# Missing cells are replaced by the string 'nan', which the parsing loop below tests against
hpa = pd.read_csv(
    unprocessed_data_location + 'proteinatlas_search.tsv',
    header=0,
    sep='\t',
).fillna('nan')
hpa.head(n=1)

In [None]:
# Flatten the "name: value;name: value" nTPM columns into one record per (gene, location).
# Each source column is paired with the anatomy type it is recorded under; the column
# order below determines the order of the records produced for a gene.
hpa_results = []
for idx, row in tqdm(hpa.iterrows(), total=hpa.shape[0]):
    ens, gene, uniprot, evid = str(row['Ensembl']), str(row['Gene']), str(row['Uniprot']), str(row['Evidence'])
    for column, label in (
        ('RNA tissue specific nTPM', 'anatomy'),
        ('RNA cell line specific nTPM', 'cell line'),
        ('RNA brain regional specific nTPM', 'anatomy'),
        ('RNA blood cell specific nTPM', 'anatomy'),
        ('RNA blood lineage specific nTPM', 'anatomy'),
    ):
        # 'nan' marks a missing cell (see the fillna in the loading cell)
        if row[column] == 'nan':
            continue
        for x in row[column].split(';'):
            # name is the text before the first ':', value the text after the first ': '
            hpa_results.append([ens, gene, uniprot, evid, label, str(x.split(':')[0]), str(x.split(': ')[1])])

In [None]:
# Dump the seven string fields of every record as one tab-separated line
with open(unprocessed_data_location + 'HPA_RNA_GENE_PROTEIN_EDGES.txt', 'w') as out:
    for x in tqdm(hpa_results):
        out.write('\t'.join(x[:7]) + '\n')

In [None]:
# Load the tab-separated HPA edge file written by the previous cell (it has no header
# row, so the seven column names are supplied here) and preview it
hpa_edges = pd.read_csv(unprocessed_data_location + 'HPA_RNA_GENE_PROTEIN_EDGES.txt',
                           header=None, low_memory=False, sep='\t',
                           names=['Ensembl_IDs', 'Gene_Symbols', 'Uniprot_IDs', 'Evidence',
                                   'Anatomy_Type', 'Anatomy', 'TPM'])

hpa_edges.head(n=3)

In [None]:
# Keep the transcript-level evidence rows and only the columns needed for the RNA edges
hpa_rna_edges = hpa_edges.loc[
    hpa_edges['Evidence'] == 'Evidence at transcript level',
    ['Gene_Symbols', 'Anatomy', 'TPM'],
]
hpa_rna_edges.head(n=3)

In [None]:
# Gene symbol -> mapped identifier (arrives as column 1), then anatomy label -> mapped
# term (a second column 1); after the second join they are named '1_x' and '1_y'.
hpa_rna_edges = hpa_rna_edges.merge(
    symbol_ensembl_map.rename(columns={0:'Gene_Symbols'}), on='Gene_Symbols'
).drop(columns=['Gene_Symbols'])
hpa_rna_edges = hpa_rna_edges.merge(
    hpa_gtex_map.rename(columns={0:'Anatomy'}), on='Anatomy'
).drop(columns=['Anatomy'])

hpa_rna_edges['Source'] = 'The_Human_Protein_Atlas'

# Prefer the RNAcentral ID where one exists; otherwise fall back to the identifier in '1_x'
hpa_rna_edges = hpa_rna_edges.merge(
    rnacentral_map_human_ensembl[['RNAcentral ID','Ensembl transcript ID']]
    .drop_duplicates()
    .rename(columns={'Ensembl transcript ID':'1_x'}),
    on='1_x',
    how='left',
)
hpa_rna_edges['RNAcentral ID'] = hpa_rna_edges['RNAcentral ID'].fillna(hpa_rna_edges['1_x'])
hpa_rna_edges = hpa_rna_edges.drop(columns=['1_x'])
RNA_located_in_OBO1 = hpa_rna_edges.copy().rename(columns={'RNAcentral ID': ':START_ID', '1_y': ':END_ID'})
RNA_located_in_OBO1.head(n=3)

* GTEx

In [None]:
# The first two lines of the .gct file are skipped; the third line holds the column names
gtex = pd.read_csv(unprocessed_data_location + 'GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct', header=0, skiprows=2, delimiter='\t')
gtex.fillna('nan', inplace=True) 
# Remove everything from the first '.' onwards in 'Name' (the version suffix of the identifier).
# The result is assigned back rather than using inplace=True on the column selection, which
# is a chained in-place update and does not modify gtex under pandas copy-on-write; the
# pattern is a raw string so that "\." is not an invalid string escape.
gtex['Name'] = gtex['Name'].replace(r'(\..*)', '', regex=True)
gtex.head(n=3)

In [None]:
# remove rows that contain protein coding genes already in the hpa data
hpa_genes = list(hpa['Ensembl'].drop_duplicates(keep='first', inplace=False))
# Vectorised membership test; the previous per-row `x not in hpa_genes` scanned the whole
# list for every GTEx row.
gtex = gtex.loc[~gtex['Name'].isin(hpa_genes)]

# loop over data and re-organize - only keep results with tpm >= 1 and if gene symbol is not a protein-coding gene
# Record layout matches the HPA records: [id, symbol, uniprot placeholder, evidence, type, column name, tpm]
gtex_results = []
for idx, row in tqdm(gtex.iterrows(), total=gtex.shape[0]):
    # the first two columns are 'Name' and 'Description'; every remaining column is a sample group
    for col in list(gtex.columns)[2:]:
        typ = 'cell line' if 'Cells' in col else 'anatomy'
        if row[col] >= 1.0:
            evidence = 'Evidence at transcript level'
            gtex_results.append([str(row['Name']), str(row['Description']), 'nan', evidence, typ, str(col), str(row[col])])

In [None]:
# Dump the seven string fields of every record as one tab-separated line
with open(unprocessed_data_location + 'GTEX_RNA_GENE_PROTEIN_EDGES.txt', 'w') as out:
    for x in tqdm(gtex_results):
        out.write('\t'.join(x[:7]) + '\n')

In [None]:
# Load the tab-separated GTEx edge file written by the previous cell and preview it.
# NOTE: the variable name hpa_edges is reused here, so from this point on it holds the
# GTEx records and no longer the Human Protein Atlas ones.
hpa_edges = pd.read_csv(unprocessed_data_location + 'GTEX_RNA_GENE_PROTEIN_EDGES.txt',
                           header=None, low_memory=False, sep='\t',
                           names=['Ensembl_IDs', 'Gene_Symbols', 'Uniprot_IDs', 'Evidence',
                                   'Anatomy_Type', 'Anatomy', 'TPM'])

hpa_edges.head(n=3)

In [None]:
# Same pipeline as for the HPA edges, applied to the GTEx records now held in hpa_edges.
# Keep the transcript-level evidence rows and only the columns needed for the RNA edges.
hpa_rna_edges = hpa_edges.loc[
    hpa_edges['Evidence'] == 'Evidence at transcript level',
    ['Gene_Symbols', 'Anatomy', 'TPM'],
]
# Gene symbol -> mapped identifier (column 1), then anatomy label -> mapped term (a second
# column 1); after the second join they are named '1_x' and '1_y'.
hpa_rna_edges = hpa_rna_edges.merge(
    symbol_ensembl_map.rename(columns={0:'Gene_Symbols'}), on='Gene_Symbols'
).drop(columns=['Gene_Symbols'])
hpa_rna_edges = hpa_rna_edges.merge(
    hpa_gtex_map.rename(columns={0:'Anatomy'}), on='Anatomy'
).drop(columns=['Anatomy'])

hpa_rna_edges['Source'] = 'GTEx'

# Prefer the RNAcentral ID where one exists; otherwise fall back to the identifier in '1_x'
hpa_rna_edges = hpa_rna_edges.merge(
    rnacentral_map_human_ensembl[['RNAcentral ID','Ensembl transcript ID']]
    .drop_duplicates()
    .rename(columns={'Ensembl transcript ID':'1_x'}),
    on='1_x',
    how='left',
)
hpa_rna_edges['RNAcentral ID'] = hpa_rna_edges['RNAcentral ID'].fillna(hpa_rna_edges['1_x'])
hpa_rna_edges = hpa_rna_edges.drop(columns=['1_x'])
RNA_located_in_OBO2 = hpa_rna_edges.copy().rename(columns={'RNAcentral ID': ':START_ID', '1_y': ':END_ID'})
RNA_located_in_OBO2.head(n=3)

* [RNALocate](http://www.rnalocate.org/) <br/> RNALocate aims to provide a resource for efficient manipulation, browsing and analysis of RNA subcellular localization.

In [None]:
!wget http://www.rnalocate.org/static/download/All%20RNA%20subcellular%20localization%20information.zip -O ../resources/processed_data/unprocessed_data/All%20RNA%20subcellular%20localization%20information.zip
with zipfile.ZipFile('../resources/processed_data/unprocessed_data/All RNA subcellular localization information.zip', 'r') as zip_ref:
    zip_ref.extractall('../resources/processed_data/unprocessed_data/')

In [None]:
# Load the RNALocate subcellular-localisation table.
RNA_location = pd.read_csv(unprocessed_data_location+'All RNA subcellular localization information.txt',sep='\t') # Subcellular localization (GO)
# Keep rows whose species contains 'apiens' and that carry a GO accession.
RNA_location = RNA_location[(RNA_location['Species'].str.contains('apiens')) &
                            (RNA_location['GO_Accession'].notna())].drop(columns=['Species','Subcellular_Localization','RNALocate_ID'])
RNA_location['GO_Accession'] = RNA_location['GO_Accession'].str.replace(':',"_")
# We keep only entries whose score is >= 0.95 (see http://www.rnalocate.org/help Q9)
# .copy() so the column assignments below act on an independent frame, not a slice.
RNA_location = RNA_location[RNA_location['RNALocate_Score'] >= 0.95].copy()

# Normalise PubMed IDs to digit strings, with NaN for missing or zero IDs.
# The float suffix is removed with an anchored regex: an unanchored ".0" pattern
# is a regular expression on older pandas and would delete any character
# followed by "0" from inside an ID.
pubmed_ids = pd.to_numeric(RNA_location['PubMed_ID'], errors='coerce').replace(0, np.nan)
RNA_location['PubMed_ID'] = (pubmed_ids.astype(str)
                             .str.replace(r'\.0$', '', regex=True)
                             .replace({'<NA>': np.nan, 'nan': np.nan}))

print(RNA_location.RNA_Type.unique())
RNA_location['Source'] = 'RNALocate'
RNA_location.head(n=3)

In [None]:
# Symbol -> transcript lookup restricted to protein-coding Ensembl transcripts.
mrna_ensembl_map = (
    ensembl_map.loc[ensembl_map['ensembl_transcript_type'] == 'protein_coding',
                    ['transcript_stable_id', 'symbol']]
    .drop_duplicates()
)
# mRNA rows of RNALocate, joined to their transcripts through the gene symbol
# and renamed to the edge-table column names.
mRNA_location = (
    RNA_location[RNA_location['RNA_Type'] == 'mRNA']
    .merge(mrna_ensembl_map, left_on='RNA_Symbol', right_on='symbol')
    .rename(columns={'transcript_stable_id': ':START_ID',
                     'GO_Accession': ':END_ID',
                     'RNALocate_Score': 'RNAsister_score',
                     'PubMed_ID': 'PubMedID'})
    .drop(columns=['RNA_Type'])
)
mRNA_location.head(n=3)

In [None]:
# Everything that is neither miRNA nor mRNA is matched to RNAcentral through
# the HGNC symbol together with the RNA type.
hgnc_lookup = rnacentral_map_human_hgnc.rename(
    columns={'HGNC symbol': 'RNA_Symbol', 'RNA category': 'RNA_Type'})
ncRNA_location = (
    RNA_location[~RNA_location['RNA_Type'].isin(['miRNA', 'mRNA'])]
    .merge(hgnc_lookup, on=['RNA_Symbol', 'RNA_Type'])
    .rename(columns={'RNAcentral ID': ':START_ID',
                     'GO_Accession': ':END_ID',
                     'RNALocate_Score': 'RNAsister_score',
                     'PubMed_ID': 'PubMedID'})
    .drop(columns=['RNA_Type'])
)
ncRNA_location.head(n=3)

In [None]:
# miRNA rows of RNALocate, matched to RNAcentral through the 'DB Description' name.
miRNA_location = RNA_location[RNA_location['RNA_Type'] == 'miRNA'].drop(columns=['RNA_Type'])

# Symbols with no direct RNAcentral match: retry them with both arm suffixes.
miRNA_unmatched = miRNA_location[~miRNA_location['RNA_Symbol'].isin(rnacentral_map_human['DB Description'])]
print(miRNA_unmatched['RNA_Symbol'].str[:3].unique())
# .assign builds new frames, so nothing is written into a slice of miRNA_location
# and each variable name matches the suffix it carries.
miRNA_RNA_miRNAnotInRNAcentral5p = miRNA_unmatched.assign(
    RNA_Symbol=miRNA_unmatched['RNA_Symbol'].astype(str) + '-5p')
miRNA_RNA_miRNAnotInRNAcentral3p = miRNA_unmatched.assign(
    RNA_Symbol=miRNA_unmatched['RNA_Symbol'].astype(str) + '-3p')
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_map_human.rename(
    columns={'DB Description':'RNA_Symbol'}), on='RNA_Symbol').drop(columns=['RNA_Symbol']).rename(columns={'RNAcentral ID':'RNA'})

# Direct matches, followed by the suffix-rescued rows.
miRNA_location = pd.merge(miRNA_location, rnacentral_map_human.rename(columns={'DB Description':'RNA_Symbol'}), on='RNA_Symbol')
miRNA_location = pd.concat([miRNA_location.rename(columns={'RNAcentral ID':'RNA'}),
                       miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB', 'Organism', 'RNA category','DB ID','RNA_Symbol'])

miRNA_location['Source'] = 'RNALocate'
miRNA_location = miRNA_location.rename(columns={'RNA':':START_ID','GO_Accession':':END_ID',
                                                'RNALocate_Score':'RNAsister_score',
                                                'PubMed_ID':'PubMedID'})
miRNA_location.head(n=3)

In [None]:
# Stack the three RNALocate-derived edge tables (mRNA, other ncRNA, miRNA).
rnalocate_parts = [mRNA_location, ncRNA_location, miRNA_location]
RNA_located_in_OBO3 = pd.concat(rnalocate_parts).copy()
RNA_located_in_OBO3.head(n=3)

* [piRBase](http://bigdata.ibp.ac.cn/piRBase/)

In [None]:
# Parse the locally saved piRBase page of every piRNA and stack the second HTML
# table of each page. Tables are collected in a list and concatenated once,
# instead of re-copying a growing DataFrame on every iteration.
pirbase_tables = []
for i in rnacentral_map_human_pirbase['piRBase ID'].unique(): # Uberon+CLO
    #response = requests.get(url = 'http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=' + i)
    #with open(f'../resources/processed_data/unprocessed_data/piRBase/{i}.html', 'wb') as file:
    #    file.write(response.content)
    temp = pd.read_html(f'../resources/processed_data/unprocessed_data/piRBase/{i}.html')[1]
    temp['piRNA'] = i  # remember which piRNA the rows belong to
    pirbase_tables.append(temp)
df = pd.concat(pirbase_tables)

df = df.drop(columns=['Reads'])
df.head(n=3)

In [None]:
# Normalise the free-text tissue labels so they can be matched against the
# anatomy/cell description map: lower-case, cut everything from the first "(",
# then apply the literal substitutions below in order.
tissue_labels = df['Tissue'].str.lower().str.replace(r'\(.*', '', regex=True)
tissue_substitutions = [
    ('adult ', ''),
    (' from 2nd trimester embryos', ''),
    (' from 1st trimester embryos', ''),
    ('mesenchymal stromal cells from bone marrow', ' stroma of bone marrow'),
    (', neurosurgical fluid, extracellular vesicles', ''),
    ('2 cell ', ''),
    (')', ''),
    ('morula embryo', 'morula'),
]
for old_text, new_text in tissue_substitutions:
    tissue_labels = tissue_labels.str.replace(old_text, new_text)
df['Tissue'] = tissue_labels.str.strip()
df['Tissue'].unique()

In [None]:
# Map the cleaned tissue labels through desc_anatomyCell_map, then swap the
# piRBase IDs for RNAcentral IDs (both inner joins).
df = df.merge(desc_anatomyCell_map.rename(columns={0:'Tissue'}), on='Tissue').drop(columns=['Tissue']).rename(columns={1:'Tissue'})
df = pd.merge(df, rnacentral_map_human_pirbase.rename(columns={'piRBase ID':'piRNA'}), on='piRNA').drop(
    columns=['piRNA']).rename(columns={'RNAcentral ID':'RNA'})
df = df.drop(columns=['Dataset'])

# Normalise PubMed IDs to digit strings, with NaN for missing or zero IDs.
# The float suffix is removed with an anchored regex: an unanchored ".0" pattern
# is a regular expression on older pandas and would delete any character
# followed by "0" from inside an ID.
pubmed_ids = pd.to_numeric(df['PubMed'], errors='coerce').replace(0, np.nan)
df['PubMed'] = (pubmed_ids.astype(str)
                .str.replace(r'\.0$', '', regex=True)
                .replace({'<NA>': np.nan, 'nan': np.nan}))

df['Accession'] = "NCBI/geo/query/acc.cgi?acc=" + df['Accession'].astype(str)

# Replace method names with their mapped value where method_map has one;
# otherwise keep the lower-cased original text.
df['Method'] = df['Method'].str.lower().str.strip()
df = pd.merge(df, method_map, right_on='0_y', left_on='Method', how='left')
df['0_x'] = df['0_x'].fillna(df['Method'])
df = df.drop(columns=['0_y', 'Method'])
df = df.rename(columns={'0_x':'Method'})

# Emit every record twice: once with the method and once with the accession
# link stored in the Method column.
df = pd.concat([df.drop(columns='Accession'), df.drop(columns='Method').rename(columns={'Accession':'Method'})])

df.rename(columns={'RNA':':START_ID','Tissue':':END_ID','PubMed':'PubMedID'},inplace=True)
df['Source'] = 'piRBase'
RNA_located_in_OBO4 = df.copy()
RNA_located_in_OBO4.head(n=3)

* [circBase](http://www.circbase.org/) <br /> circBase is a database where merged and unified data sets of circRNAs and the evidence supporting their expression can be accessed, downloaded, and browsed within the genomic context.

In [None]:
!wget http://www.circbase.org/download/hsa_hg19_circRNA.txt -P ../resources/processed_data/unprocessed_data/hsa_hg19_circRNA.txt

In [None]:
circbase = pd.read_csv(unprocessed_data_location + 'hsa_hg19_circRNA.txt', sep='\t') # CLO+Uberon

# One row per (circRNA, sample): the comma-separated sample list is lower-cased,
# split and exploded, then mapped through desc_anatomyCell_map (inner join).
circrna_clo_anatomy = circbase[['circRNA ID','samples','best transcript','circRNA study']].drop_duplicates()
circrna_clo_anatomy['samples'] = circrna_clo_anatomy['samples'].str.lower().str.split(', ')
circrna_clo_anatomy = (
    circrna_clo_anatomy
    .explode('samples')
    .merge(desc_anatomyCell_map.rename(columns={0:'samples'}), on='samples')
    .drop(columns=['samples'])
    .rename(columns={1:'Tissue'})
)
print(circrna_clo_anatomy['circRNA study'].unique())

# Swap the study labels for PubMed IDs, then give each study its own row.
# NOTE(review): verify each PMID against the cited study before relying on it.
study_to_pubmed = {
    "Salzman2013": "22319583",
    "Memczak2013": "23446348",
    "Zhang2013": "25242744",
    "Jeck2013": "23249747",
}
study_labels = circrna_clo_anatomy['circRNA study']
for study_name, pubmed_id in study_to_pubmed.items():
    study_labels = study_labels.str.replace(study_name, pubmed_id)
circrna_clo_anatomy['circRNA study'] = study_labels.str.split(', ')
circrna_clo_anatomy = circrna_clo_anatomy.explode('circRNA study')

circrna_clo_anatomy['Source'] = 'circBase'
circrna_clo_anatomy = circrna_clo_anatomy.rename(
    columns={'circRNA ID':':START_ID','Tissue':':END_ID','circRNA study':'PubMedID'})
RNA_located_in_OBO5 = circrna_clo_anatomy.copy()
RNA_located_in_OBO5.head(n=3)

* [LncBase](https://diana.e-ce.uth.gr/lncbasev3/home) <br /> DIANA-LncBase v3 is a reference repository with experimentally supported miRNA targets on long non-coding transcripts.

In [None]:
# https://diana.e-ce.uth.gr/lncbasev3/interactions --> check all filters --> "Download" button --> CSV -->
# --> Give consent --> Download file via link received by e-mail

In [None]:
# LncBase expression table (tab-separated despite the .csv extension); Species is dropped on load.
lncRNA_anatomy = pd.read_csv(unprocessed_data_location + 'lncbase_lncrna-anatomy.csv', sep="\t").drop(columns=['Species']) # Uberon+CLO
lncRNA_anatomy['Tissue'] = lncRNA_anatomy['Tissue'].str.lower()
lncRNA_anatomy['Cell Type'] = lncRNA_anatomy['Cell Type'].str.lower()

# Stack the frame on itself so both the tissue and the cell type end up in a single 'Tissue' column.
lncRNA_anatomy = pd.concat([lncRNA_anatomy.drop(columns=['Cell Type']),lncRNA_anatomy.drop(
    columns=['Tissue']).rename(columns={'Cell Type':'Tissue'})])

lncRNA_anatomy = lncRNA_anatomy.drop(columns=['Gene Id','Gene Name'])
lncRNA_anatomy = lncRNA_anatomy[~lncRNA_anatomy['Tissue'].isna()]
print(lncRNA_anatomy['Transcript Id'].str[:3].unique())
# Keep only the part of the transcript ID before the first '.'.
lncRNA_anatomy['Transcript Id'] = lncRNA_anatomy['Transcript Id'].str.split('.').str[0]

# Average TPM per (transcript, tissue, category).
lncRNA_anatomy = lncRNA_anatomy.groupby(["Transcript Id","Tissue","Category"]).agg({'TPM':np.mean}).reset_index()
lncRNA_anatomy['Source'] = 'LncBase'

# Keep only rows whose averaged TPM is at least the overall mean.
print(lncRNA_anatomy.TPM.mean())
lncRNA_anatomy = lncRNA_anatomy[lncRNA_anatomy['TPM'] >= lncRNA_anatomy.TPM.mean()]

# Map tissue / cell-type labels through desc_anatomyCell_map (inner join).
lncRNA_anatomy = pd.merge(lncRNA_anatomy, desc_anatomyCell_map.rename(columns={0:'Tissue'}), on='Tissue').drop(columns=['Tissue']).rename(
    columns={1:'Tissue'})

# NOTE(review): this join matches 'Transcript Id' against the *RNAcentral ID* column of the RefSeq map
# (renamed to 'Transcript Id'), and RefSeq ID / Label are only dropped afterwards. It therefore keeps
# only transcripts whose ID is already an RNAcentral ID, possibly duplicated once per RefSeq mapping --
# confirm this is intended rather than a join on the RefSeq ID.
lncRNA_anatomy = pd.merge(lncRNA_anatomy, pd.concat([rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']],
                                                     rnacentral_map_human_refseq[['RNAcentral ID','Label']].rename(
                                                         columns={'Label':'RefSeq ID'})]).drop_duplicates().dropna().rename(
                                                             columns={'RNAcentral ID':'Transcript Id'}),
                                                                on='Transcript Id').drop(columns=['RefSeq ID'])

# One row per category, then rewrite the LncBase category names with the replacement strings below.
lncRNA_anatomy['Category'] = lncRNA_anatomy['Category'].str.split(",")
lncRNA_anatomy = lncRNA_anatomy.explode('Category')
print(lncRNA_anatomy['Category'].unique())
lncRNA_anatomy['Category'] = lncRNA_anatomy['Category'].str.replace('Cancer/Malignant','cancer cell line')
lncRNA_anatomy['Category'] = lncRNA_anatomy['Category'].str.replace('Stem/Progenitor','stem cell line cell')
lncRNA_anatomy['Category'] = lncRNA_anatomy['Category'].str.replace('Normal/Primary','primary cultured cell')
lncRNA_anatomy['Category'] = lncRNA_anatomy['Category'].str.replace('Embryonic/Fetal','embryonic stem cell line cell')

# Final edge table: transcript -> tissue, with the category stored as 'Location'.
RNA_located_in_OBO6 = lncRNA_anatomy.copy()
RNA_located_in_OBO6.rename(columns={'Transcript Id':':START_ID','Tissue':':END_ID','Category':'Location'},inplace=True)
RNA_located_in_OBO6.head(n=3)

* [miRandola](http://mirandola.iit.cnr.it/) <br /> miRandola is a comprehensive manually curated classification of different extracellular circulating non-coding RNA types.

In [None]:
# Download the miRandola (version 02_2017) dump into the unprocessed-data folder; -O sets the output file name.
!wget http://mirandola.iit.cnr.it/download/miRandola_version_02_2017.txt -O ../resources/processed_data/unprocessed_data/miRandola_version_02_2017.txt

In [None]:
# Load the tab-separated miRandola dump and list the RNA classes it contains.
# circBase -- Extracellular form - GO
mirandola_file = unprocessed_data_location + 'miRandola_version_02_2017.txt'
RNA_ev = pd.read_csv(mirandola_file, sep='\t')
print(RNA_ev['RNA_class'].unique())
RNA_ev.head(n=2)

In [None]:
# Human miRandola rows that carry a circRNA accession. .copy() so the column
# assignments below act on an independent frame, not a slice of RNA_ev.
circRNA_ev = RNA_ev[(RNA_ev['circRNA_accession'].notna()) & (RNA_ev['organism'].str.contains('apiens'))].copy()
circRNA_ev['circRNA_accession'] = circRNA_ev['circRNA_accession'].str.lower()
circRNA_ev = circRNA_ev[circRNA_ev['circRNA_accession'].str.match(r'hsa_circ_\d+')].copy()
# circRNA in miRandola only circulates in blood
circRNA_ev['GO'] = 'GO_0072562'

# Normalise PubMed IDs to digit strings, with NaN for missing or zero IDs.
# The float suffix is removed with an anchored regex: an unanchored ".0" pattern
# is a regular expression on older pandas and would delete any character
# followed by "0" from inside an ID.
pubmed_ids = pd.to_numeric(circRNA_ev['PubMed_ID'], errors='coerce').replace(0, np.nan)
circRNA_ev['PubMed_ID'] = (pubmed_ids.astype(str)
                           .str.replace(r'\.0$', '', regex=True)
                           .replace({'<NA>': np.nan, 'nan': np.nan}))

# Disease / cell line -> 'Location': use the disease_map value where one exists,
# otherwise the lower-cased original text; "normal" carries no location.
circRNA_ev['disease_or_cell_line'] = circRNA_ev['disease_or_cell_line'].str.lower()
circRNA_ev = pd.merge(circRNA_ev, disease_map, right_on='0_y', left_on='disease_or_cell_line', how='left')
circRNA_ev['0_x'] = circRNA_ev['0_x'].fillna(circRNA_ev['disease_or_cell_line'])
circRNA_ev = circRNA_ev.drop(columns=['0_y', 'disease_or_cell_line'])
circRNA_ev = circRNA_ev.rename(columns={'0_x':'Location'})
circRNA_ev['Location'] = circRNA_ev['Location'].replace("normal", np.nan)

# Methods are "|"-separated with optional parenthesised details. Explode at the
# DataFrame level so every method gets its own row and stays attached to the
# record it came from.
circRNA_ev['method'] = (circRNA_ev['method'].str.lower()
                        .str.replace(r"\(.*?\)", "", regex=True)
                        .str.split("|"))
circRNA_ev = circRNA_ev.explode('method', ignore_index=True)
circRNA_ev['method'] = circRNA_ev['method'].str.strip()
circRNA_ev = pd.merge(circRNA_ev, method_map, right_on='0_y', left_on='method', how='left')
circRNA_ev['0_x'] = circRNA_ev['0_x'].fillna(circRNA_ev['method'])
circRNA_ev = circRNA_ev.drop(columns=['0_y', 'method'])
circRNA_ev = circRNA_ev.rename(columns={'0_x':'Method'})

circRNA_ev = circRNA_ev.drop(columns=['RNA','RNA_class','miRBase_Last_Version','miRBase_accession',
                                    'miRBase_family','organism','description','sample'])
circRNA_ev['Source'] = 'miRandola'
circRNA_ev.rename(columns={'circRNA_accession':':START_ID','GO':':END_ID','PubMed_ID':'PubMedID'},inplace=True)
RNA_located_in_OBO7 = circRNA_ev.copy()
RNA_located_in_OBO7.head(n=3)

* [miRandola](http://mirandola.iit.cnr.it/) 

In [None]:
# Human miRandola rows that carry a miRBase accession. .copy() so the column
# assignments below act on an independent frame, not a slice of RNA_ev.
mirnaRNA_ev = RNA_ev[(RNA_ev['miRBase_accession'].notna()) & (RNA_ev['organism'].str.contains('apiens'))].copy() # Extracellular form - GO
mirnaRNA_ev['sample'] = mirnaRNA_ev['sample'].str.strip().str.lower()

mirnaRNA_ev = mirnaRNA_ev.drop(columns=['RNA','RNA_class','miRBase_Last_Version','circRNA_accession',
                                    'miRBase_family','organism','description'])
print(mirnaRNA_ev[~mirnaRNA_ev['miRBase_accession'].isin(rnacentral_map_human_mirbase['miRBase ID'])]['miRBase_accession'].unique())
# Manual curation of accessions missing from the miRBase map: two are rewritten
# to an RNAcentral ID, one is discarded.
mirnaRNA_ev['miRBase_accession'] = mirnaRNA_ev['miRBase_accession'].str.replace('MIMAT0005905','URS000047047A')
mirnaRNA_ev['miRBase_accession'] = mirnaRNA_ev['miRBase_accession'].str.replace('MIMAT0015090','URS00003CF845')
mirnaRNA_ev['miRBase_accession'] = mirnaRNA_ev['miRBase_accession'].replace('MIMAT0005954',np.nan)
mirnaRNA_ev = mirnaRNA_ev.dropna(subset=['miRBase_accession'])
# Left join, so the manually rewritten accessions (absent from the map) survive
# and are used as the RNA identifier by the fillna below; an inner join would
# drop them and make that fillna a no-op.
mirnaRNA_ev = pd.merge(mirnaRNA_ev, rnacentral_map_human_mirbase[['miRBase ID','RNAcentral ID']].rename(
    columns={'miRBase ID':'miRBase_accession'}).drop_duplicates(), on='miRBase_accession', how='left').rename(
        columns={'RNAcentral ID':'RNA'})
mirnaRNA_ev['RNA'] = mirnaRNA_ev['RNA'].fillna(mirnaRNA_ev['miRBase_accession'])

# Normalise PubMed IDs to digit strings, with NaN for missing or zero IDs.
# The float suffix is removed with an anchored regex: an unanchored ".0" pattern
# is a regular expression on older pandas and would delete any character
# followed by "0" from inside an ID.
pubmed_ids = pd.to_numeric(mirnaRNA_ev['PubMed_ID'], errors='coerce').replace(0, np.nan)
mirnaRNA_ev['PubMed_ID'] = (pubmed_ids.astype(str)
                            .str.replace(r'\.0$', '', regex=True)
                            .replace({'<NA>': np.nan, 'nan': np.nan}))

# Disease / cell line -> 'Location': use the disease_map value where one exists,
# otherwise the cleaned original text; "normal" carries no location.
mirnaRNA_ev['disease_or_cell_line'] = mirnaRNA_ev['disease_or_cell_line'].str.lower()
mirnaRNA_ev['disease_or_cell_line'] = mirnaRNA_ev['disease_or_cell_line'].str.replace(r"\(.*?\)", "", regex=True)
mirnaRNA_ev = pd.merge(mirnaRNA_ev, disease_map, right_on='0_y', left_on='disease_or_cell_line', how='left')
mirnaRNA_ev['0_x'] = mirnaRNA_ev['0_x'].fillna(mirnaRNA_ev['disease_or_cell_line'])
mirnaRNA_ev = mirnaRNA_ev.drop(columns=['0_y', 'disease_or_cell_line'])
mirnaRNA_ev = mirnaRNA_ev.rename(columns={'0_x':'Location'})
mirnaRNA_ev['Location'] = mirnaRNA_ev['Location'].replace("normal", np.nan)

# Methods are "|"-separated with optional parenthesised details. Explode at the
# DataFrame level so every method gets its own row and stays attached to the
# record it came from.
mirnaRNA_ev['method'] = (mirnaRNA_ev['method'].str.lower()
                         .str.replace(r"\(.*?\)", "", regex=True)
                         .str.split("|"))
mirnaRNA_ev = mirnaRNA_ev.explode('method', ignore_index=True)
mirnaRNA_ev['method'] = mirnaRNA_ev['method'].str.strip()
mirnaRNA_ev = pd.merge(mirnaRNA_ev, method_map, right_on='0_y', left_on='method', how='left')
mirnaRNA_ev['0_x'] = mirnaRNA_ev['0_x'].fillna(mirnaRNA_ev['method'])
mirnaRNA_ev = mirnaRNA_ev.drop(columns=['0_y', 'method'])
mirnaRNA_ev = mirnaRNA_ev.rename(columns={'0_x':'Method'})

# Map the sample description to a GO term through desc_go_map (inner join).
mirnaRNA_ev = pd.merge(mirnaRNA_ev, desc_go_map.rename(columns={0:'sample'}), on='sample').drop(columns=['sample']).rename(
        columns={1:'GO'})
mirnaRNA_ev['Source'] = 'miRandola'
RNA_located_in_OBO8 = mirnaRNA_ev.copy()
RNA_located_in_OBO8.rename(columns={'RNA':':START_ID','GO':':END_ID','PubMed_ID':'PubMedID'},inplace=True)
RNA_located_in_OBO8.head(n=3)

In [None]:
# Human lncRNA rows of miRandola. .copy() so the column assignments below act on
# an independent frame, not a slice of RNA_ev.
lncrnaRNA_ev = RNA_ev[(RNA_ev['RNA'].notna()) & (RNA_ev['organism'].str.contains('apiens')) & (RNA_ev['RNA_class'] == 'lncRNA')].copy()
lncrnaRNA_ev['sample'] = lncrnaRNA_ev['sample'].str.strip().str.lower()
lncrnaRNA_ev['RNA'] = lncrnaRNA_ev['RNA'].str.strip().str.upper()

lncrnaRNA_ev = lncrnaRNA_ev.drop(columns=['RNA_class','miRBase_Last_Version','circRNA_accession',
                                    'miRBase_family','organism','description'])
# Upper-cased RNA name -> RNAcentral ID through the HGNC symbol, then sample
# description -> GO term through desc_go_map (both inner joins).
lncrnaRNA_ev = pd.merge(lncrnaRNA_ev, rnacentral_map_human_hgnc[['HGNC symbol','RNAcentral ID']].rename(
    columns={'HGNC symbol':'RNA'}).drop_duplicates(), on='RNA').drop(columns=['RNA']).rename(columns={'RNAcentral ID':'RNA'})
lncrnaRNA_ev = pd.merge(lncrnaRNA_ev, desc_go_map.rename(columns={0:'sample'}), on='sample').drop(columns=['sample']).rename(
        columns={1:'GO'})

# Normalise PubMed IDs to digit strings, with NaN for missing or zero IDs.
# The float suffix is removed with an anchored regex: an unanchored ".0" pattern
# is a regular expression on older pandas and would delete any character
# followed by "0" from inside an ID.
pubmed_ids = pd.to_numeric(lncrnaRNA_ev['PubMed_ID'], errors='coerce').replace(0, np.nan)
lncrnaRNA_ev['PubMed_ID'] = (pubmed_ids.astype(str)
                             .str.replace(r'\.0$', '', regex=True)
                             .replace({'<NA>': np.nan, 'nan': np.nan}))

# Disease / cell line -> 'Location': use the disease_map value where one exists,
# otherwise the lower-cased original text; "normal" carries no location.
lncrnaRNA_ev['disease_or_cell_line'] = lncrnaRNA_ev['disease_or_cell_line'].str.lower()
lncrnaRNA_ev = pd.merge(lncrnaRNA_ev, disease_map, right_on='0_y', left_on='disease_or_cell_line', how='left')
lncrnaRNA_ev['0_x'] = lncrnaRNA_ev['0_x'].fillna(lncrnaRNA_ev['disease_or_cell_line'])
lncrnaRNA_ev = lncrnaRNA_ev.drop(columns=['0_y', 'disease_or_cell_line'])
lncrnaRNA_ev = lncrnaRNA_ev.rename(columns={'0_x':'Location'})
lncrnaRNA_ev['Location'] = lncrnaRNA_ev['Location'].replace("normal", np.nan)

# Methods are "|"-separated with optional parenthesised details. Explode at the
# DataFrame level so every method gets its own row and stays attached to the
# record it came from.
lncrnaRNA_ev['method'] = (lncrnaRNA_ev['method'].str.lower()
                          .str.replace(r"\(.*?\)", "", regex=True)
                          .str.split("|"))
lncrnaRNA_ev = lncrnaRNA_ev.explode('method', ignore_index=True)
lncrnaRNA_ev['method'] = lncrnaRNA_ev['method'].str.strip()
lncrnaRNA_ev = pd.merge(lncrnaRNA_ev, method_map, right_on='0_y', left_on='method', how='left')
lncrnaRNA_ev['0_x'] = lncrnaRNA_ev['0_x'].fillna(lncrnaRNA_ev['method'])
lncrnaRNA_ev = lncrnaRNA_ev.drop(columns=['0_y', 'method'])
lncrnaRNA_ev = lncrnaRNA_ev.rename(columns={'0_x':'Method'})

lncrnaRNA_ev['Source'] = 'miRandola'
lncrnaRNA_ev.rename(columns={'RNA':':START_ID','GO':':END_ID','PubMed_ID':'PubMedID'},inplace=True)
RNA_located_in_OBO9 = lncrnaRNA_ev.copy()
RNA_located_in_OBO9.head(n=3)

* [TAM](http://www.lirmed.com/tam2/)

In [None]:
# Read the TAM miRNA-set file: one tab-separated set per line, with a variable
# number of fields (short rows are padded with '').
with open(unprocessed_data_location+'mirset_v9.txt', 'r') as file:
    data = file.read().rstrip()
    
TAM = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

# Keep the rows whose first field contains "TissueSpecific".
miRNA_anatomy = TAM[(TAM[0].str.contains("TissueSpecific"))]# Uberon
miRNA_anatomy=miRNA_anatomy.drop(columns=[0])
miRNA_anatomy=miRNA_anatomy.dropna(axis=1, how='all')
# Row 1236 is appended a second time and the two copies are relabelled below.
# NOTE(review): presumably a combined heart/muscle set -- confirm against mirset_v9.txt.
miRNA_anatomy=pd.concat([miRNA_anatomy,miRNA_anatomy.loc[1236].to_frame().T])
miRNA_anatomy=miRNA_anatomy.reset_index(drop=True)
# Single-step .loc assignment (row label == position after reset_index): the
# chained form iloc[row][col] = ... may write to a temporary copy and be lost.
miRNA_anatomy.loc[3, 1] = "Heart"
miRNA_anatomy.loc[6, 1] = "Muscle"
miRNA_anatomy.head(n=1)

In [None]:
# Collapse every column after the first (the tissue name) into one
# comma-separated string per row.
member_columns = miRNA_anatomy.columns[1:]
joined_members = miRNA_anatomy[member_columns].apply(
    lambda members: ','.join(members.dropna().astype(str)), axis=1)
# Hard-coded Uberon term per row, in row order (one entry per row of the frame).
uberon_terms = ['UBERON_0002369', 'UBERON_0000955', 'UBERON_0001155',
                'UBERON_0002349', 'UBERON_0001150', 'UBERON_0001987', 'UBERON_0001630']
miRNA_anatomy = miRNA_anatomy.assign(merged=joined_members, Uberon=uberon_terms)[[1, 'Uberon', 'merged']]
# Back to one row per member of each set.
miRNA_anatomy = miRNA_anatomy.assign(merged=miRNA_anatomy['merged'].str.split(',')).explode('merged')
miRNA_anatomy['Source'] = 'TAM'
miRNA_anatomy.head(n=1)

In [None]:
# Drop the empty padding entries left over from the ragged input rows.
miRNA_anatomy = miRNA_anatomy[miRNA_anatomy['merged'] != '']

# Report which miRNA names have no RNAcentral 'DB Description' match.
print(all(miRNA_anatomy['merged'].isin(rnacentral_map_human['DB Description'])))
print(miRNA_anatomy[~miRNA_anatomy['merged'].isin(rnacentral_map_human['DB Description'])]['merged'].unique())
# 'hsa-mir-194' is expanded to both 'hsa-mir-194-1' and 'hsa-mir-194-2'. The
# match is exact, so names that merely start with 'hsa-mir-194' are left untouched.
is_mir194 = miRNA_anatomy['merged'] == 'hsa-mir-194'
miRNA_anatomy = pd.concat([
    miRNA_anatomy.assign(merged=miRNA_anatomy['merged'].mask(is_mir194, 'hsa-mir-194-1')),
    miRNA_anatomy[is_mir194].assign(merged='hsa-mir-194-2'),
])
miRNA_anatomy = pd.merge(miRNA_anatomy, rnacentral_map_human.rename(columns={'DB Description':'merged'}), on='merged').drop(
    columns=['merged',1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':':START_ID', 'Uberon':':END_ID'})

# These edges come from the TAM tissue-specific sets, so they are labelled 'TAM'.
miRNA_anatomy['Source'] = 'TAM'
RNA_located_in_OBO10 = miRNA_anatomy.copy()
RNA_located_in_OBO10.head(n=3)

* [miRPathDB](https://mpd.bioinf.uni-sb.de/overview.html)

In [None]:
#tar = tarfile.open(unprocessed_data_location+'miRPathDB2_hsa_gmt.tar.gz', 'r:gz')
#tar.extractall(unprocessed_data_location)
#tar.close()

# Read the GMT file: one tab-separated set per line, with a variable number of
# fields (short rows are padded with '').
with open(unprocessed_data_location+'hsa/GO_CC_validated_miRTarBase_strong.gmt', 'r') as file:# GO
    data = file.read().rstrip()
    
miRNA_GO = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

# Column 0 holds the set description; it is lower-cased and joined to
# desc_go_map, whose column 1 then supplies the value used as the edge end node.
miRNA_GO[0] = miRNA_GO[0].str.lower()
miRNA_GO = miRNA_GO.dropna(axis=1, how='all')
miRNA_GO = miRNA_GO.drop(columns=[1])
miRNA_GO = pd.merge(desc_go_map, miRNA_GO, on=[0])
miRNA_GO = miRNA_GO.dropna(axis=1, how='all')

# Reshape to long form, one (member, description, mapped value) row per set
# member. Rows are visited once and the pieces concatenated once, rather than
# transposing the whole frame and re-concatenating on every iteration.
dflist = []
for _, go_row in miRNA_GO.iterrows():
    dflist.append(pd.DataFrame({0: go_row.drop(index=[0, 1]),
                                1: go_row.loc[0],
                                2: go_row.loc[1]}))
miRNA_GO = pd.concat(dflist) if dflist else pd.DataFrame(columns=[0, 1, 2])
miRNA_GO = miRNA_GO.dropna()
miRNA_GO = miRNA_GO[miRNA_GO[0] != '']

print(all(miRNA_GO[0].isin(rnacentral_map_human['DB Description'])))
miRNA_GO = pd.merge(miRNA_GO, rnacentral_map_human.rename(columns={'DB Description':0}), on=0).drop(
    columns=[0,1,'DB','DB ID','Organism','RNA category']).rename(columns={'RNAcentral ID':':START_ID', 2:':END_ID'})

miRNA_GO['Source'] = 'miRPathDB'
RNA_located_in_OBO11 = miRNA_GO.copy()
RNA_located_in_OBO11.head(n=3)

* [lncATLAS](https://lncatlas.crg.eu/) <br /> LncATLAS displays the subcellular localisation for GENCODE-annotated lncRNAs. This localisation is expressed in units of Relative Concentration Index (RCI) - a comparison of the concentration of a gene, per unit mass of RNA, between two cellular compartments.

In [None]:
# https://lncatlas.crg.eu --> Get Raw Data --> Download All raw data
lncRNA_comp = pd.read_csv(unprocessed_data_location + '2024-12-23_lncATLAS_all_data.csv').drop(columns=['Gene Name','Coding Type','Biotype'])# GO

# Mapping to GO CC
print(lncRNA_comp['Data Type'].unique())
lncRNA_comp['GO'] = lncRNA_comp['Data Type'].replace({'nucleus': 'GO_0005634', 'cytosol': 'GO_0005829',
                                                        'chromatin': 'GO_0000785', 'membrane': 'GO_0016020',
                                                        'nucleolus': 'GO_0005730', 'nucleoplasm': 'GO_0005654'})
# Data types without a GO mapping keep their original text and are filtered out here.
lncRNA_comp = lncRNA_comp[lncRNA_comp['GO'].astype(str).str.startswith('GO_')]
lncRNA_comp.drop(columns=['Data Type'],inplace=True)

# Data cleaning rule to establish relations: discard FPKM below the mean
print(lncRNA_comp.Value.mean())
lncRNA_comp = lncRNA_comp[lncRNA_comp.Value >= lncRNA_comp.Value.mean()].rename(columns={'Value':'FPKM'})

# Two gene -> RNA expansions of the same rows: RNAcentral IDs and Ensembl transcript IDs.
lncRNA_comp_rnacentral = pd.merge(lncRNA_comp, rnacentral_map_human_ensembl[['RNAcentral ID','Ensembl Gene ID']].drop_duplicates().rename(
    columns={'Ensembl Gene ID':'ENSEMBL ID'}), on='ENSEMBL ID').rename(columns={'RNAcentral ID':'RNA'})
lncRNA_comp_ensembl = pd.merge(lncRNA_comp, ensembl_map[['ensembl_gene_id','transcript_stable_id']].drop_duplicates().rename(
    columns={'ensembl_gene_id':'ENSEMBL ID'}), on='ENSEMBL ID').drop(columns=['ENSEMBL ID']).rename(columns={'transcript_stable_id':'RNA'})
# NOTE(review): 'RNA' holds transcript IDs here, while lncRNA_comp_rnacentral['ENSEMBL ID'] holds the gene IDs
# merged on above, so this filter presumably never removes a row -- confirm which identifier was meant.
lncRNA_comp_ensembl = lncRNA_comp_ensembl[~lncRNA_comp_ensembl['RNA'].isin(lncRNA_comp_rnacentral['ENSEMBL ID'])]
lncRNA_comp_rnacentral.drop(columns=['ENSEMBL ID'],inplace=True)

lncRNA_comp = pd.concat([lncRNA_comp_ensembl, lncRNA_comp_rnacentral]).rename(columns={'RNA':':START_ID','GO':':END_ID'})

# 'Data Source' -> 'Location': use the disease_map value where one exists, otherwise the lower-cased original text.
lncRNA_comp['Data Source'] = lncRNA_comp['Data Source'].str.lower()
lncRNA_comp = pd.merge(lncRNA_comp, disease_map, right_on='0_y', left_on='Data Source', how='left')
lncRNA_comp['0_x'] = lncRNA_comp['0_x'].fillna(lncRNA_comp['Data Source'])
lncRNA_comp = lncRNA_comp.drop(columns=['0_y', 'Data Source'])
lncRNA_comp = lncRNA_comp.rename(columns={'0_x':'Location'})

lncRNA_comp['Source'] = 'lncATLAS'
RNA_located_in_OBO12 = lncRNA_comp.copy()
RNA_located_in_OBO12.head(n=3)

* [LncBase](https://diana.e-ce.uth.gr/lncbasev3/home)

In [None]:
# https://diana.e-ce.uth.gr/lncbasev3/interactions --> check all filters --> "Download" button --> CSV -->
# --> Give consent --> Download file via link received by e-mail

In [None]:
# LncBase compartment table (tab-separated despite the .csv extension).
lncRNA_comp2 = pd.read_csv(unprocessed_data_location + 'lncbase_lncrna-CC.csv', sep="\t")# GO
# Compartment names -> GO cellular-component identifiers.
lncRNA_comp2['Compartment'] = lncRNA_comp2['Compartment'].replace({'Nucleus': 'GO_0005634', 'Cytoplasm': 'GO_0005737'})
#lncRNA_comp2 = pd.merge(lncRNA_comp2, symbol_entrez_map.rename(columns={'0_x':'Gene Name'}), on='Gene Name')
lncRNA_comp2.drop(columns=['Gene Name','Gene Id','Nucleus TPM','Cytoplasm TPM','Replicates'],inplace=True)
# Use the magnitude of RCI and keep rows at or above the mean magnitude.
lncRNA_comp2['RCI'] = abs(lncRNA_comp2.RCI)
print(lncRNA_comp2.RCI.mean())
lncRNA_comp2 = lncRNA_comp2[lncRNA_comp2.RCI >= lncRNA_comp2.RCI.mean()]
print(lncRNA_comp2['Transcript Id'].str[:4].unique())
# Keep only the part of the transcript ID before the first '.'.
lncRNA_comp2['Transcript Id'] = lncRNA_comp2['Transcript Id'].str.split('.').str[0]
lncRNA_comp2['Source'] = 'LncBase'

# NOTE(review): this join matches 'Transcript Id' against the *RNAcentral ID* column of the RefSeq map
# (renamed to 'Transcript Id'), and RefSeq ID / Label are only dropped afterwards -- confirm this is
# intended rather than a join on the RefSeq ID.
lncRNA_comp2_ensembl = pd.merge(lncRNA_comp2, pd.concat([rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']],
                                                     rnacentral_map_human_refseq[['RNAcentral ID','Label']].rename(
                                                         columns={'Label':'RefSeq ID'})]).drop_duplicates().dropna().rename(
                                                             columns={'RNAcentral ID':'Transcript Id'}),
                                                                on='Transcript Id').drop(columns=['RefSeq ID'])
lncRNA_comp2_ensembl.rename(columns={'Transcript Id':'RNA'},inplace=True)

lncRNA_comp2_ensembl['Tissue'] = lncRNA_comp2_ensembl['Tissue'].str.lower()
lncRNA_comp2_ensembl['Cell Type'] = lncRNA_comp2_ensembl['Cell Type'].str.lower()

# Stack the frame on itself so both the tissue and the cell type end up in a single 'Tissue' column.
lncRNA_comp2_ensembl = pd.concat([lncRNA_comp2_ensembl.drop(columns=['Cell Type']),lncRNA_comp2_ensembl.drop(
    columns=['Tissue']).rename(columns={'Cell Type':'Tissue'})])

# One row per category.
lncRNA_comp2_ensembl['Category'] = lncRNA_comp2_ensembl['Category'].str.split(",")
lncRNA_comp2_ensembl = lncRNA_comp2_ensembl.explode('Category')
print(lncRNA_comp2_ensembl['Category'].unique())

# 'Tissue' -> 'Location': use the disease_map value where one exists, otherwise the lower-cased original text.
lncRNA_comp2_ensembl['Tissue'] = lncRNA_comp2_ensembl['Tissue'].str.lower()
lncRNA_comp2_ensembl = pd.merge(lncRNA_comp2_ensembl, disease_map, right_on='0_y', left_on='Tissue', how='left')
lncRNA_comp2_ensembl['0_x'] = lncRNA_comp2_ensembl['0_x'].fillna(lncRNA_comp2_ensembl['Tissue'])
lncRNA_comp2_ensembl = lncRNA_comp2_ensembl.drop(columns=['0_y', 'Tissue'])
lncRNA_comp2_ensembl = lncRNA_comp2_ensembl.rename(columns={'0_x':'Location'})

# Rewrite the LncBase category names with the replacement strings below.
lncRNA_comp2_ensembl['Category'] = lncRNA_comp2_ensembl['Category'].str.replace('Cancer/Malignant','cancer cell line')
lncRNA_comp2_ensembl['Category'] = lncRNA_comp2_ensembl['Category'].str.replace('Stem/Progenitor','stem cell line cell')
lncRNA_comp2_ensembl['Category'] = lncRNA_comp2_ensembl['Category'].str.replace('Normal/Primary','primary cultured cell')
lncRNA_comp2_ensembl['Category'] = lncRNA_comp2_ensembl['Category'].str.replace('Embryonic/Fetal','embryonic stem cell line cell')

# Emit every record twice: once with the category and once with the tissue-derived value as 'Location'.
lncRNA_comp2_ensembl = pd.concat([lncRNA_comp2_ensembl.drop(columns=['Location']).rename(columns={'Category':'Location'}),
                                  lncRNA_comp2_ensembl.drop(columns=['Category'])])
# After the concat above no 'Category' column is left, so only the first two renames take effect.
lncRNA_comp2_ensembl.rename(columns={'RNA':':START_ID','Compartment':':END_ID','Category':'Location'},inplace=True)
RNA_located_in_OBO13 = lncRNA_comp2_ensembl.copy()
RNA_located_in_OBO13.head(n=3)

* [tRFdb](http://genome.bioch.virginia.edu/trfdb/index.php)

In [None]:
# tRFdb -- CLO+Uberon. The three result pages must first be saved by hand from
# http://genome.bioch.virginia.edu/trfdb/index.php as trf1.html, trf3.html and trf5.html.
# In each saved page the third HTML table ([2]) is the one that is kept.
tRF1_tRNA, tRF3_tRNA, tRF5_tRNA = (
    pd.read_html(unprocessed_data_location + page)[2].drop(columns=['Organism'])
    for page in ('trf1.html', 'trf3.html', 'trf5.html')
)

# Stack the three tables and discard the columns that are not used afterwards
tRF_tRNA = pd.concat([tRF1_tRNA, tRF3_tRNA, tRF5_tRNA]).drop(columns=['Experiment Info', 'Sequence'])
tRF_tRNA['tRF ID'] = tRF_tRNA['tRF ID'].astype(str)  # IDs are handled as text from here on
tRF_tRNA.head(n=3)

In [None]:
# Scrape the per-tRF experiment table from tRFdb for every tRF ID.
# Frames are collected in a list and concatenated once: growing a DataFrame with
# pd.concat inside the loop copies all previously fetched rows on every iteration.
experiment_frames = []

for tRF_ID in tRF_tRNA['tRF ID']:
    url = f"http://genome.bioch.virginia.edu/trfdb/experiments_display.php?trf_id={tRF_ID}&organism=human"
    # A timeout keeps a stalled connection from hanging the notebook; an HTTP error is
    # raised here instead of surfacing later as an HTML parsing failure.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    temp = pd.read_html(response.content)[0].drop(columns=['GEO / SRA Links','View Alignment','Graph Alignment'])
    temp['tRF ID'] = tRF_ID
    experiment_frames.append(temp)

df = pd.concat(experiment_frames, ignore_index=True) if experiment_frames else pd.DataFrame()

# Normalise the sample description: lower-case, hyphens to spaces, drop "normal "/"whole",
# and make every label end with " cell"
df['Source'] = (df['Source'].str.lower()
                            .str.replace('-', ' ', regex=False)
                            .str.replace('normal ', '', regex=False)
                            .str.replace('whole', '', regex=False))
df['Source'] = df['Source'].apply(lambda x: x + ' cell' if not x.endswith(' cell') else x)

# Discard experiments in which the tRF was not detected
df = df[df['Abundance'] != 0]
df.to_csv(unprocessed_data_location+'trf_location.csv', index=False)
df.head(n=3)

In [None]:
# Peek at the first few normalised sample labels
df['Source'].unique()[:3]

In [None]:
# Split the experiments by accession family and turn each accession into a link-like
# string. `.copy()` makes the subsets independent frames so the column assignment below
# is not a write on a slice of `df`; `na=False` keeps a missing accession from breaking
# the boolean mask (such rows match neither prefix and are dropped).
df_gsm = df[df['Experiment'].str.startswith('GSM', na=False)].copy()
df_sra = df[df['Experiment'].str.startswith('SR', na=False)].copy()
df_gsm['Experiment'] = "NCBI/geo/query/acc.cgi?acc=" + df_gsm['Experiment']
df_sra['Experiment'] = "NCBI/sra/query/acc.cgi?acc=" + df_sra['Experiment']
df = pd.concat([df_gsm, df_sra])

# Map the normalised sample label (renamed to column 0 so it lines up with the map's key)
# through desc_anatomyCell_map; rows without a match are dropped by the inner join
df = pd.merge(df.rename(columns={'Source':0}), desc_anatomyCell_map).drop(columns=[0])

df = df.rename(columns={1:':END_ID', 'tRF ID':':START_ID', 'Experiment':'Location'})
df[':START_ID'] = "trfdb?" + df[':START_ID'].astype(str)
df['Source'] = 'tRFdb'
RNA_located_in_OBO14 = df.copy()
RNA_located_in_OBO14.head(n=3)

* [Vesiclepedia](http://microvesicles.org/index.html) <br/> Vesiclepedia is a manually curated compendium of molecular data (lipid, RNA and protein) identified in different classes of extracellular vesicles. 

In [None]:
!wget http://microvesicles.org/Archive/VESICLEPEDIA_EXPERIMENT_DETAILS_5.1.txt -O ../resources/processed_data/unprocessed_data/VESICLEPEDIA_EXPERIMENT_DETAILS_5.1.txt
!wget http://microvesicles.org/Archive/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt o ../resources/processed_data/unprocessed_data/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt

In [None]:
# Vesiclepedia experiment metadata, restricted to rows whose species contains "apiens"
experiments = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_EXPERIMENT_DETAILS_5.1.txt', sep='\t')
experiments = experiments[experiments['SPECIES'].str.contains('apiens')]

# Protein/mRNA cargo records, joined to the experiment in which they were observed
protein_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt', sep='\t')
protein_ev = protein_ev[protein_ev['SPECIES'].str.contains('apiens')]
protein_ev = pd.merge(protein_ev, experiments, on=['EXPERIMENT ID'])

# PubMed IDs: non-numeric values and 0 become NaN, everything else a digit-only string.
# The float suffix is stripped with an anchored regex: a bare ".0" pattern is interpreted
# as a regex by pandas < 2.0 and would delete every "<any char>0" pair inside the ID.
protein_ev['PUBMED ID'] = pd.to_numeric(protein_ev['PUBMED ID'], errors='coerce').replace(0, np.nan)
protein_ev['PUBMED ID'] = (protein_ev['PUBMED ID'].astype(str)
                           .str.replace(r"\.0$", "", regex=True)
                           .replace(["<NA>", "nan"], np.nan))

# Isolation methods: one row per "|"-separated method, mapped through method_map
# (falls back to the raw lower-cased text when the map has no entry)
protein_ev['ISOLATION METHOD'] = protein_ev['ISOLATION METHOD'].str.lower().str.split("|")
protein_ev = protein_ev.explode('ISOLATION METHOD')
protein_ev = pd.merge(protein_ev, method_map, right_on='0_y', left_on='ISOLATION METHOD', how='left')
protein_ev['0_x'] = protein_ev['0_x'].fillna(protein_ev['ISOLATION METHOD'])
protein_ev = protein_ev.drop(columns=['0_y', 'ISOLATION METHOD'])
protein_ev = protein_ev.rename(columns={'0_x':'Method','PUBMED ID':'PubMedID'})

# Sample description: mapped through location_map with the same raw-text fallback
protein_ev['SAMPLE'] = protein_ev['SAMPLE'].str.lower()
protein_ev = pd.merge(protein_ev, location_map, right_on='0_y', left_on='SAMPLE', how='left')
protein_ev['0_x'] = protein_ev['0_x'].fillna(protein_ev['SAMPLE'])
protein_ev = protein_ev.drop(columns=['0_y', 'SAMPLE'])
protein_ev = protein_ev.rename(columns={'0_x':'Location'})

protein_ev = protein_ev[["CONTENT TYPE","ENTREZ GENE ID","PubMedID","Location","Method","VESICLE TYPE"]]
print(protein_ev['VESICLE TYPE'].unique())
protein_ev.head(n=3)

In [None]:
# Rewrite free-text vesicle types as GO identifiers. The rules run in this order, so an
# earlier rule wins; the last rule prefixes anything that still does not start with
# "GO_" with the default identifier GO_1990742.
vesicle_type_rules = [
    (r".*xosomes.*", "GO_0070062"),
    (r"Membrane blebs", "GO_0032059"),
    (r"Apoptotic bodies", "GO_0097189"),
    (r".*embrane", "GO_0016020"),
    (r".*icrovesicles", "GO_1990742"),
    (r"^(?!GO_.*)", "GO_1990742"),
]
for pattern, go_id in vesicle_type_rules:
    protein_ev['VESICLE TYPE'] = protein_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=True)

# Keep only the identifier: the first run of digits, re-prefixed with "GO_"
# (raw string: "\d" is an invalid escape sequence in a normal string literal)
protein_ev['VESICLE TYPE'] = 'GO_'+protein_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)
protein_ev.head(n=3)

In [None]:
# List the distinct cargo types; the following cells filter on these labels
protein_ev['CONTENT TYPE'].unique()

In [None]:
# Vesiclepedia mRNA cargo: keep both spellings of the content type, then require an
# Entrez gene ID that is present and starts with a digit
mRNA_ev = protein_ev[protein_ev['CONTENT TYPE'].isin(['mRNA', 'mrna'])].drop(columns=['CONTENT TYPE'])
mRNA_ev = mRNA_ev[mRNA_ev['ENTREZ GENE ID'].notna()]
mRNA_ev = mRNA_ev[mRNA_ev['ENTREZ GENE ID'].astype(str).str[0].str.isdigit()]

# Entrez gene (col 0) -> transcript (col 1) pairs whose type (col 2) is protein-coding
ensembl_mrna = ensembl_entrezTranscript_map[[0,1,2]]
ensembl_mrna = ensembl_mrna[ensembl_mrna[2] == "protein-coding"].drop_duplicates()
mRNA_ev = (mRNA_ev.merge(ensembl_mrna, left_on='ENTREZ GENE ID', right_on=0)
                  .drop(columns=["ENTREZ GENE ID",0,2])
                  .rename(columns={'VESICLE TYPE':':END_ID', 1:':START_ID'}))

mRNA_ev['Source'] = 'Vesiclepedia'
RNA_located_in_OBO15 = mRNA_ev.copy()
RNA_located_in_OBO15.head(n=3)

In [None]:
# Vesiclepedia snRNA cargo with a usable Entrez gene ID (present and starting with a digit)
snRNA_ev = protein_ev[protein_ev['CONTENT TYPE']=='snrna'].drop(columns=['CONTENT TYPE'])
snRNA_ev = snRNA_ev[(snRNA_ev['ENTREZ GENE ID'].notna())]
snRNA_ev = snRNA_ev[(snRNA_ev['ENTREZ GENE ID'].astype(str).str[0].str.isdigit())]

# Entrez gene (col 0) -> transcript (col 1) pairs whose type (col 2) is snRNA
ensembl_snrna = ensembl_entrezTranscript_map[[0,1,2]]
ensembl_snrna = ensembl_snrna[ensembl_snrna[2] == "snRNA"].drop_duplicates()
snRNA_ev = pd.merge(snRNA_ev, ensembl_snrna, left_on='ENTREZ GENE ID', right_on=0).drop(columns=["ENTREZ GENE ID",0,2])

snRNA_ev['Source'] = 'Vesiclepedia'

# Prefer RNAcentral identifiers; transcripts without one keep their Ensembl ID (column 1)
snRNA_ev_rnacentral = pd.merge(snRNA_ev, rnacentral_map_human_ensembl[['RNAcentral ID','Ensembl transcript ID']].drop_duplicates(),
                               left_on=1, right_on='Ensembl transcript ID').drop(columns=['Ensembl transcript ID'])
snRNA_ev_ensembl = snRNA_ev[~snRNA_ev[1].isin(snRNA_ev_rnacentral[1])]
snRNA_ev_rnacentral = snRNA_ev_rnacentral.drop(columns=[1])

print(snRNA_ev_ensembl.head(n=3)) # Empty
# Both subsets must expose their identifier as :START_ID. Previously only 'RNAcentral ID'
# was renamed, so any Ensembl-only row ended up with a missing :START_ID after the concat.
RNA_located_in_OBO16 = pd.concat([snRNA_ev_ensembl.rename(columns={1:':START_ID'}),
                                  snRNA_ev_rnacentral.rename(columns={'RNAcentral ID':':START_ID'})])
RNA_located_in_OBO16 = RNA_located_in_OBO16.rename(columns={'VESICLE TYPE':':END_ID'})
RNA_located_in_OBO16.head(n=3)

In [None]:
# Download the Vesiclepedia 5.1 miRNA table into the unprocessed-data folder
!wget http://microvesicles.org/Archive/VESICLEPEDIA_MIRNA_DETAILS_5.1.txt -O ../resources/processed_data/unprocessed_data/VESICLEPEDIA_MIRNA_DETAILS_5.1.txt

In [None]:
# Vesiclepedia miRNA cargo: rows whose species contains "apiens" and that are flagged as
# a clear Entrez hit, joined to the (already loaded) experiment metadata
miRNA_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_MIRNA_DETAILS_5.1.txt', sep='\t')

miRNA_ev = miRNA_ev[(miRNA_ev['SPECIES'].notna()) & (miRNA_ev['SPECIES'].str.contains('apiens'))]
miRNA_ev = miRNA_ev[(miRNA_ev['COMMENTS'] == 'Clear hit to Entrez gene ID')]
miRNA_ev = pd.merge(miRNA_ev, experiments, on=['EXPERIMENT ID'])

miRNA_ev['MIRNA ID'] = 'hsa-' + miRNA_ev['MIRNA ID'].astype(str)

# Rewrite free-text vesicle types as GO identifiers; rules run in order and the last one
# prefixes anything that still does not start with "GO_" with the default GO_1990742
vesicle_type_rules = [
    (r".*xosomes.*", "GO_0070062"),
    (r"Membrane blebs", "GO_0032059"),
    (r"Apoptotic bodies", "GO_0097189"),
    (r".*embrane", "GO_0016020"),
    (r".*icrovesicles", "GO_1990742"),
    (r"^(?!GO_.*)", "GO_1990742"),
]
for pattern, go_id in vesicle_type_rules:
    miRNA_ev['VESICLE TYPE'] = miRNA_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=True)
# raw string: "\d" is an invalid escape sequence in a normal string literal
miRNA_ev['VESICLE TYPE'] = 'GO_'+miRNA_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)

# PubMed IDs: non-numeric values and 0 become NaN, everything else a digit-only string.
# The float suffix is stripped with an anchored regex: a bare ".0" pattern is interpreted
# as a regex by pandas < 2.0 and would delete every "<any char>0" pair inside the ID.
miRNA_ev['PUBMED ID'] = pd.to_numeric(miRNA_ev['PUBMED ID'], errors='coerce').replace(0, np.nan)
miRNA_ev['PUBMED ID'] = (miRNA_ev['PUBMED ID'].astype(str)
                         .str.replace(r"\.0$", "", regex=True)
                         .replace(["<NA>", "nan"], np.nan))

# Isolation methods: one row per "|"-separated method, mapped through method_map
# (falls back to the raw lower-cased text when the map has no entry)
miRNA_ev['ISOLATION METHOD'] = miRNA_ev['ISOLATION METHOD'].str.lower().str.split("|")
miRNA_ev = miRNA_ev.explode('ISOLATION METHOD')
miRNA_ev = pd.merge(miRNA_ev, method_map, right_on='0_y', left_on='ISOLATION METHOD', how='left')
miRNA_ev['0_x'] = miRNA_ev['0_x'].fillna(miRNA_ev['ISOLATION METHOD'])
miRNA_ev = miRNA_ev.drop(columns=['0_y', 'ISOLATION METHOD'])
miRNA_ev = miRNA_ev.rename(columns={'0_x':'Method','PUBMED ID':'PubMedID'})

# Sample description: mapped through location_map with the same raw-text fallback
miRNA_ev['SAMPLE'] = miRNA_ev['SAMPLE'].str.lower()
miRNA_ev = pd.merge(miRNA_ev, location_map, right_on='0_y', left_on='SAMPLE', how='left')
miRNA_ev['0_x'] = miRNA_ev['0_x'].fillna(miRNA_ev['SAMPLE'])
miRNA_ev = miRNA_ev.drop(columns=['0_y', 'SAMPLE'])
miRNA_ev = miRNA_ev.rename(columns={'0_x':'Location'})

miRNA_ev = miRNA_ev[["CONTENT TYPE","MIRNA ID", "PubMedID", "Location","Method","VESICLE TYPE"]]
miRNA_ev.head(n=3)

In [None]:
# miRNA names that RNAcentral does not list verbatim (mask computed once and reused)
rnacentral_by_mirna = rnacentral_map_human.rename(columns={'DB Description':'MIRNA ID'})
mirna_not_in_rnacentral = miRNA_ev[~miRNA_ev['MIRNA ID'].isin(rnacentral_map_human['DB Description'])]
print(mirna_not_in_rnacentral['MIRNA ID'].str[:3].unique())

# Retry those names with each arm suffix. `.assign` builds new frames instead of writing
# into a filtered slice, and each variable now carries the suffix its name announces
# (the two were swapped before; the concatenated result contains the same rows).
miRNA_RNA_miRNAnotInRNAcentral5p = mirna_not_in_rnacentral.assign(
    **{'MIRNA ID': mirna_not_in_rnacentral['MIRNA ID'].astype(str) + '-5p'})
miRNA_RNA_miRNAnotInRNAcentral3p = mirna_not_in_rnacentral.assign(
    **{'MIRNA ID': mirna_not_in_rnacentral['MIRNA ID'].astype(str) + '-3p'})
miRNA_RNA_miRNAnotInRNAcentral = pd.concat([miRNA_RNA_miRNAnotInRNAcentral5p, miRNA_RNA_miRNAnotInRNAcentral3p])
miRNA_RNA_miRNAnotInRNAcentral = (pd.merge(miRNA_RNA_miRNAnotInRNAcentral, rnacentral_by_mirna, on='MIRNA ID')
                                  .drop(columns=['MIRNA ID'])
                                  .rename(columns={'RNAcentral ID':'RNA'}))

# Direct name matches, followed by the suffix-rescued ones
miRNA_ev = pd.merge(miRNA_ev, rnacentral_by_mirna, on='MIRNA ID')
miRNA_ev = pd.concat([miRNA_ev.rename(columns={'RNAcentral ID':'RNA'}),
                       miRNA_RNA_miRNAnotInRNAcentral]).drop(columns=['DB', 'Organism', 'RNA category','DB ID','MIRNA ID','CONTENT TYPE'])

miRNA_ev = miRNA_ev.rename(columns={'VESICLE TYPE':':END_ID', 'RNA':':START_ID'})
miRNA_ev['Source'] = 'Vesiclepedia'
RNA_located_in_OBO17 = miRNA_ev.copy()
RNA_located_in_OBO17.head(n=3)

* [Rfam](http://rfamlive.xfam.org/) <br /> The Rfam database is a collection of RNA families, each represented by multiple sequence alignments, consensus secondary structures and covariance models.

In [None]:
# RNAcentral -> GO annotations derived from Rfam, restricted to IDs ending in "_9606"
rfam_go = pd.read_csv(unprocessed_data_location + 'rnacentral_rfam_annotations.tsv.gz', sep='\t',
                               names=['RNAcentral ID', "GO", "Rfam ID"]).drop(columns=['Rfam ID'])
rfam_go = rfam_go[rfam_go['RNAcentral ID'].str.endswith('_9606')]
rfam_go['RNAcentral ID'] = rfam_go['RNAcentral ID'].str.split('_').str[0]
rfam_go['GO'] = rfam_go['GO'].str.replace(':','_')

# Every row is credited to both sources (one row per source after the explode)
rfam_go['Source'] = 'Rfam, RNAcentral'
rfam_go['Source'] = rfam_go['Source'].str.split(", ")
rfam_go = rfam_go.explode('Source')

# Ask QuickGO for the aspect of each distinct GO term (one HTTP request per term).
# The accumulator is deliberately not called `dict`: that name would shadow the
# builtin for every cell executed afterwards.
go_aspects = []
goterms_in_rfam = rfam_go['GO'].unique()
for term in goterms_in_rfam:
    aspect = pd.read_json("https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/" + term.replace("_",":") + "/complete")['results'][0]
    go_aspects.append(aspect.get("aspect"))
goterms_in_rfam_map_relation = pd.DataFrame({'GO':goterms_in_rfam, 'Aspect':go_aspects})

# Split the annotations by aspect; the cellular-component ones are the location edges
rfam_go = pd.merge(rfam_go, goterms_in_rfam_map_relation, on='GO')
rfam_go = rfam_go.rename(columns={'RNAcentral ID':':START_ID','GO':':END_ID'})
rfam_gobp = rfam_go[rfam_go['Aspect'] =='biological_process'].drop(columns=['Aspect'])
rfam_gomf = rfam_go[rfam_go['Aspect'] =='molecular_function'].drop(columns=['Aspect'])
RNA_located_in_OBO18 = rfam_go[rfam_go['Aspect'] =='cellular_component'].drop(columns=['Aspect'])

RNA_located_in_OBO18.head(n=3)#1025

In [None]:
# Pool the eighteen per-source RNA -> location edge tables
located_in_sources = [
    RNA_located_in_OBO1, RNA_located_in_OBO2, RNA_located_in_OBO3, RNA_located_in_OBO4,
    RNA_located_in_OBO5, RNA_located_in_OBO6, RNA_located_in_OBO7, RNA_located_in_OBO8,
    RNA_located_in_OBO9, RNA_located_in_OBO10, RNA_located_in_OBO11, RNA_located_in_OBO12,
    RNA_located_in_OBO13, RNA_located_in_OBO14, RNA_located_in_OBO15, RNA_located_in_OBO16,
    RNA_located_in_OBO17, RNA_located_in_OBO18,
]
RNA_located_in_OBO = pd.concat(located_in_sources)

# One row per (start, end) pair: categorical properties are collected into sets,
# numeric ones are averaged
located_in_aggregation = {
    'Source': set,
    'TPM': np.mean,
    'PubMedID': set,
    'RNAsister_score': np.mean,
    'Method': set,
    'Location': set,
    'FPKM': np.mean,
    'RCI': np.mean,
    'Abundance': np.mean,
}
RNA_located_in_OBO = RNA_located_in_OBO.groupby([':START_ID',':END_ID']).agg(located_in_aggregation).reset_index()
RNA_located_in_OBO[":TYPE"] = "located_in"
RNA_located_in_OBO.to_pickle(unprocessed_edge_data_location+'RNA_located_in_OBO.pkl')

# Inverse relation: same rows with the endpoints swapped
OBO_location_of_RNA = RNA_located_in_OBO.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
OBO_location_of_RNA[":TYPE"] = "location_of"
OBO_location_of_RNA.to_pickle(unprocessed_edge_data_location+'OBO_location_of_RNA.pkl')
OBO_location_of_RNA.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0001025 (located in) - OBO

* The GO consortium

In [None]:
# Download the human GO annotation file (GAF) into the unprocessed-data folder.
# NOTE(review): the file is later read as "goa_human.gaf", so data_downloader presumably
# also decompresses the .gz archive -- confirm.
data_downloader("http://current.geneontology.org/annotations/goa_human.gaf.gz", unprocessed_data_location)

In [None]:
# Human GO annotations (header-less GAF; "!" lines are comments). Column 12 carries the
# taxon, column 14 the contributing database (presumably GAF "Assigned By" -- confirm).
pro_go = pd.read_csv(unprocessed_data_location+"goa_human.gaf",comment="!",sep="\t",header=None)
pro_go = pro_go[pro_go[12] == 'taxon:9606']
pro_go[14] = 'GO_Central, ' + pro_go[14]  # every annotation is also credited to GO_Central
pro_go = pro_go[[1,3,4,6,14]]
pro_go[4] = pro_go[4].str.replace('GO:','GO_')
pro_go = pd.merge(pro_go, unipro_pro_map, left_on=1, right_on=0).drop(columns=[1,'1_x',0])

# Normalise the contributor names, applied strictly in this order. An empty replacement
# removes that contributor, leaving only "GO_Central". Note that ", SynGO" runs before
# ", SynGO-UCL", so the latter never matches; the "-UCL" remainder it leaves behind is
# cleaned up by the final "GO_Central-UCL" rule.
source_rewrites = [
    (", UniProt", ", UniProtKB"),
    (", IntAct", ""),
    (", HPA", ", The_Human_Protein_Atlas"),
    (", GOC", ""),
    (", FlyBase", ""),
    (", NTNU_SB", ""),
    (", ComplexPortal", ""),
    (", ParkinsonsUK-UCL", ""),
    (", ARUK-UCL", ""),
    (", LIFEdb", ""),
    (", BHF-UCL", ""),
    (", MGI", ""),
    (", RHEA", ""),
    (", HGNC-UCL", ", HGNC"),
    (", SYSCILIA_CCNET", ""),
    (", CACAO", ""),
    (", AgBase", ""),
    (", PINC", ""),
    (", CAFA", ""),
    (", DisProt", ""),
    (", MTBBASE", ""),
    (", YuBioLab", ""),
    (", SynGO", ""),
    (", Alzheimers_University_of_Toronto", ""),
    (", GDB", ""),
    (", SynGO-UCL", ""),
    (", DFLAT", ""),
    (", DIBU", ""),
    (", PHI-base", ""),
    (", WB", ""),
    (", Xenbase", ""),
    (", ZFIN", ""),
    (", dictyBase", ""),
    (", InterPro", ""),
    ("GO_Central-UCL", "GO_Central"),
]
for old_text, new_text in source_rewrites:
    pro_go[14] = pro_go[14].str.replace(old_text, new_text)

# One row per remaining source, then the final column names
pro_go[14] = pro_go[14].str.split(", ")
pro_go = pro_go.explode(14)
pro_go[14] = pro_go[14].str.replace("GO_Central","GOC")
pro_go = pro_go.rename(columns={'1_y':':START_ID',4:':END_ID',6:'GO_evidence',14:'Source'})

# One edge table per value of column 3 (the relation), keyed as "pro_go_<relation>"
relations = pro_go[3].unique()
pro_go_dict = {f'pro_go_{i}': pro_go[pro_go[3] == i].drop(columns=[3]) for i in relations}
print(pro_go[3].unique())
pro_go_dict['pro_go_located_in'].head(n=3)

In [None]:
# Check which source labels survive the contributor normalisation
pro_go['Source'].unique()

* Human Protein Atlas

In [None]:
# Human Protein Atlas search export; missing values become the literal string 'nan',
# which the parsing loop uses as its "no data" marker
hpa = pd.read_csv(unprocessed_data_location + 'proteinatlas_search.tsv', sep='\t').fillna('nan')
hpa.head(n=1)

In [None]:
# nTPM columns to unpack, in processing order, with the anatomy type assigned to each.
# A cell looks like "name: value;name: value"; 'nan' marks a gene with no data.
ntpm_columns = [
    ('RNA tissue specific nTPM', 'anatomy'),
    ('RNA cell line specific nTPM', 'cell line'),
    ('RNA brain regional specific nTPM', 'anatomy'),
    ('RNA blood cell specific nTPM', 'anatomy'),
    ('RNA blood lineage specific nTPM', 'anatomy'),
]

hpa_results = []
for idx, row in tqdm(hpa.iterrows(), total=hpa.shape[0]):
    ens, gene, uniprot, evid = str(row['Ensembl']), str(row['Gene']), str(row['Uniprot']), str(row['Evidence'])
    for ntpm_column, anatomy_type in ntpm_columns:
        if row[ntpm_column] == 'nan':
            continue
        for x in row[ntpm_column].split(';'):
            # name is the text before the first ':', value the text after the first ': '
            hpa_results.append([ens, gene, uniprot, evid, anatomy_type,
                                str(x.split(':')[0]), str(x.split(': ')[1])])

In [None]:
# Persist the unpacked HPA records as a header-less, tab-separated file (one record per line)
with open(unprocessed_data_location + 'HPA_PROTEIN_EDGES.txt', 'w') as out:
    for x in tqdm(hpa_results):
        out.write('\t'.join(x[:7]) + '\n')

In [None]:
# load data, return edge count, and preview it
hpa_edges = pd.read_csv(unprocessed_data_location + 'HPA_PROTEIN_EDGES.txt',
                           header=None, low_memory=False, sep='\t',
                           names=['Ensembl_IDs', 'Gene_Symbols', 'Uniprot_IDs', 'Evidence',
                                   'Anatomy_Type', 'Anatomy', 'TPM'])

hpa_edges.head(n=3)

In [None]:
# Keep only records backed by protein-level evidence; the evidence column is not carried further
has_protein_evidence = hpa_edges['Evidence'] == 'Evidence at protein level'
hpa_rna_edges = hpa_edges.loc[has_protein_evidence, ['Uniprot_IDs', 'Anatomy', 'TPM']]
hpa_rna_edges.head(n=3)

In [None]:
# Translate both endpoints: UniProt accession -> protein identifier, HPA anatomy label ->
# ontology identifier. Both maps expose their value in a column named 1, so after the
# second join they surface as '1_x' (protein) and '1_y' (anatomy).
hpa_rna_edges = (hpa_rna_edges
                 .merge(unipro_pro_map.rename(columns={0:'Uniprot_IDs'}), on='Uniprot_IDs')
                 .drop(columns=['Uniprot_IDs']))
hpa_rna_edges = (hpa_rna_edges
                 .merge(hpa_gtex_map.rename(columns={0:'Anatomy'}), on='Anatomy')
                 .drop(columns=['Anatomy']))

hpa_rna_edges['Source'] = 'The_Human_Protein_Atlas'
pro_located_in2 = hpa_rna_edges.copy().rename(columns={'1_x':':START_ID','1_y':':END_ID'})
pro_located_in2.head(n=3)

* GTEx

In [None]:
# GTEx median TPM per gene and tissue (.gct: two preamble lines, then the header row)
gtex = pd.read_csv(unprocessed_data_location + 'GTEx_Analysis_v10_RNASeQCv2.4.2_gene_median_tpm.gct', header=0, skiprows=2, delimiter='\t')
gtex.fillna('nan', inplace=True)
# Strip the version suffix (first '.' onwards) from the gene IDs. The result is assigned
# back: an in-place replace on `gtex['Name']` acts on a temporary Series and leaves the
# frame untouched under pandas Copy-on-Write.
gtex['Name'] = gtex['Name'].replace(r'(\..*)', '', regex=True)
gtex.head(n=3)

In [None]:
# remove rows that contain protein coding genes already in the hpa data
# (vectorised membership test instead of scanning a Python list once per row)
hpa_genes = list(hpa['Ensembl'].drop_duplicates(keep='first', inplace=False))
gtex = gtex.loc[~gtex['Name'].isin(hpa_genes)]

# loop over data and re-organize - only keep results with tpm >= 1 and if gene symbol is not a protein-coding gene
# The tissue columns (everything after the first two) and their type are the same for
# every row, so they are worked out once, outside the loop.
tissue_columns = [(col, 'cell line' if 'Cells' in col else 'anatomy') for col in list(gtex.columns)[2:]]
evidence = 'Evidence at transcript level'

gtex_results = []
for idx, row in tqdm(gtex.iterrows(), total=gtex.shape[0]):
    for col, typ in tissue_columns:
        if row[col] >= 1.0:
            # the UniProt slot is the placeholder 'nan': GTEx rows carry no protein accession
            gtex_results.append([str(row['Name']), str(row['Description']), 'nan', evidence, typ, str(col), str(row[col])])

In [None]:
# Persist the GTEx records as a header-less, tab-separated file (one record per line)
with open(unprocessed_data_location + 'GTEX_PROTEIN_EDGES.txt', 'w') as out:
    for x in tqdm(gtex_results):
        out.write('\t'.join(x[:7]) + '\n')

In [None]:
# load data, return edge count, and preview it
hpa_edges = pd.read_csv(unprocessed_data_location + 'GTEX_PROTEIN_EDGES.txt',
                           header=None, low_memory=False, sep='\t',
                           names=['Ensembl_IDs', 'Gene_Symbols', 'Uniprot_IDs', 'Evidence',
                                   'Anatomy_Type', 'Anatomy', 'TPM'])

hpa_edges.head(n=3)

In [None]:
# GTEx rows restricted to protein-level evidence. Every GTEx record is written with
# 'Evidence at transcript level', so this filter keeps nothing and the result is empty.
has_protein_evidence = hpa_edges['Evidence'] == 'Evidence at protein level'
hpa_rna_edges = hpa_edges.loc[has_protein_evidence, ['Gene_Symbols', 'Anatomy', 'TPM']]

# Gene symbol -> protein identifier, anatomy label -> ontology identifier
hpa_rna_edges = (hpa_rna_edges
                 .merge(symbol_to_pro.rename(columns={0:'Gene_Symbols'}), on='Gene_Symbols')
                 .drop(columns=['Gene_Symbols']))
hpa_rna_edges = (hpa_rna_edges
                 .merge(hpa_gtex_map.rename(columns={0:'Anatomy'}), on='Anatomy')
                 .drop(columns=['Anatomy']))

hpa_rna_edges['Source'] = 'GTEx'
pro_located_in3 = hpa_rna_edges.copy()
pro_located_in3.head(n=3) # Empty

* [Vesiclepedia](http://microvesicles.org/index.html) <br/> Vesiclepedia is a manually curated compendium of molecular data (lipid, RNA and protein) identified in different classes of extracellular vesicles. 

In [None]:
# Download the Vesiclepedia 5.1 experiment and protein/mRNA tables into the unprocessed-data
# folder (any existing copy at the target path is overwritten)
!wget http://microvesicles.org/Archive/VESICLEPEDIA_EXPERIMENT_DETAILS_5.1.txt -O ../resources/processed_data/unprocessed_data/VESICLEPEDIA_EXPERIMENT_DETAILS_5.1.txt
!wget http://microvesicles.org/Archive/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt -O ../resources/processed_data/unprocessed_data/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt

In [None]:
# Vesiclepedia experiment metadata, restricted to rows whose species contains "apiens"
experiments = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_EXPERIMENT_DETAILS_5.1.txt', sep='\t')
experiments = experiments[experiments['SPECIES'].str.contains('apiens')]

# Protein/mRNA cargo records, joined to the experiment in which they were observed
protein_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt', sep='\t')
protein_ev = protein_ev[protein_ev['SPECIES'].str.contains('apiens')]
protein_ev = pd.merge(protein_ev, experiments, on=['EXPERIMENT ID'])

# PubMed IDs: non-numeric values and 0 become NaN, everything else a digit-only string.
# The float suffix is stripped with an anchored regex: a bare ".0" pattern is interpreted
# as a regex by pandas < 2.0 and would delete every "<any char>0" pair inside the ID.
protein_ev['PUBMED ID'] = pd.to_numeric(protein_ev['PUBMED ID'], errors='coerce').replace(0, np.nan)
protein_ev['PUBMED ID'] = (protein_ev['PUBMED ID'].astype(str)
                           .str.replace(r"\.0$", "", regex=True)
                           .replace(["<NA>", "nan"], np.nan))

# Isolation methods: one row per "|"-separated method, mapped through method_map
# (falls back to the raw lower-cased text); a bare "-" counts as missing
protein_ev['ISOLATION METHOD'] = protein_ev['ISOLATION METHOD'].str.lower().str.split("|")
protein_ev = protein_ev.explode('ISOLATION METHOD')
protein_ev = pd.merge(protein_ev, method_map, right_on='0_y', left_on='ISOLATION METHOD', how='left')
protein_ev['0_x'] = protein_ev['0_x'].fillna(protein_ev['ISOLATION METHOD'])
protein_ev = protein_ev.drop(columns=['0_y', 'ISOLATION METHOD'])
protein_ev = protein_ev.rename(columns={'0_x':'Method','PUBMED ID':'PubMedID'})
protein_ev['Method'] = protein_ev['Method'].replace("-",np.nan)

# Sample description: mapped through location_map with the same raw-text fallback
protein_ev['SAMPLE'] = protein_ev['SAMPLE'].str.lower()
protein_ev = pd.merge(protein_ev, location_map, right_on='0_y', left_on='SAMPLE', how='left')
protein_ev['0_x'] = protein_ev['0_x'].fillna(protein_ev['SAMPLE'])
protein_ev = protein_ev.drop(columns=['0_y', 'SAMPLE'])
protein_ev = protein_ev.rename(columns={'0_x':'Location'})

protein_ev = protein_ev[["CONTENT TYPE","ENTREZ GENE ID", "PubMedID", "Location","Method","VESICLE TYPE"]]

# Rewrite free-text vesicle types as GO identifiers; rules run in order and the last one
# prefixes anything that still does not start with "GO_" with the default GO_1990742
vesicle_type_rules = [
    (r".*xosomes.*", "GO_0070062"),
    (r"Membrane blebs", "GO_0032059"),
    (r"Apoptotic bodies", "GO_0097189"),
    (r".*embrane", "GO_0016020"),
    (r".*icrovesicles", "GO_1990742"),
    (r"^(?!GO_.*)", "GO_1990742"),
]
for pattern, go_id in vesicle_type_rules:
    protein_ev['VESICLE TYPE'] = protein_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=True)
# raw string: "\d" is an invalid escape sequence in a normal string literal
protein_ev['VESICLE TYPE'] = 'GO_'+protein_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)
print(protein_ev['CONTENT TYPE'].unique())
protein_ev.head(n=3)

In [None]:
# Protein cargo only (the label also occurs with a trailing space), with an Entrez gene
# ID that is present and starts with a digit
protein_ev = protein_ev[protein_ev['CONTENT TYPE'].isin(['protein', 'protein '])]
protein_ev = protein_ev.drop(columns=['CONTENT TYPE'])
protein_ev = protein_ev[protein_ev['ENTREZ GENE ID'].notna()]
protein_ev = protein_ev[protein_ev['ENTREZ GENE ID'].astype(str).str[0].str.isdigit()]

# The join key must be an integer on the map side; this updates entrez_pro_map itself
entrez_pro_map[0] = entrez_pro_map[0].astype(int)

# Entrez gene -> protein identifier (column 1 of the map becomes the edge start)
protein_ev = (entrez_pro_map.rename(columns={0:'ENTREZ GENE ID'})
              .merge(protein_ev, on=['ENTREZ GENE ID'])
              .drop(columns=['ENTREZ GENE ID']))

protein_ev['Source'] = 'Vesiclepedia'
protein_ev = protein_ev.rename(columns={1:':START_ID', 'VESICLE TYPE':':END_ID'})
protein_ev.head(n=3)

In [None]:
# Download the Vesiclepedia 5.1 lipid table into the unprocessed-data folder
!wget http://microvesicles.org/Archive/VESICLEPEDIA_LIPID_DETAILS_5.1.txt -O ../resources/processed_data/unprocessed_data/VESICLEPEDIA_LIPID_DETAILS_5.1.txt

In [None]:
# Vesiclepedia lipid cargo: Lipid (ChEBI) -- GO
lipid_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_LIPID_DETAILS_5.1.txt', sep='\t')

# Rows whose species contains "apiens", joined to the (already loaded) experiment metadata
lipid_ev = lipid_ev[(lipid_ev['SPECIES'].notna()) & (lipid_ev['SPECIES'].str.contains('apiens'))]
lipid_ev = pd.merge(lipid_ev, experiments, on=['EXPERIMENT ID'])
lipid_ev['LIPID ID'] = lipid_ev['LIPID ID'].str.lower()

# PubMed IDs: non-numeric values and 0 become NaN, everything else a digit-only string.
# The float suffix is stripped with an anchored regex: a bare ".0" pattern is interpreted
# as a regex by pandas < 2.0 and would delete every "<any char>0" pair inside the ID.
lipid_ev['PUBMED ID'] = pd.to_numeric(lipid_ev['PUBMED ID'], errors='coerce').replace(0, np.nan)
lipid_ev['PUBMED ID'] = (lipid_ev['PUBMED ID'].astype(str)
                         .str.replace(r"\.0$", "", regex=True)
                         .replace(["<NA>", "nan"], np.nan))

# Isolation methods: one row per "|"-separated method, mapped through method_map
# (falls back to the raw lower-cased text); a bare "-" counts as missing
lipid_ev['ISOLATION METHOD'] = lipid_ev['ISOLATION METHOD'].str.lower().str.split("|")
lipid_ev = lipid_ev.explode('ISOLATION METHOD')
lipid_ev = pd.merge(lipid_ev, method_map, right_on='0_y', left_on='ISOLATION METHOD', how='left')
lipid_ev['0_x'] = lipid_ev['0_x'].fillna(lipid_ev['ISOLATION METHOD'])
lipid_ev = lipid_ev.drop(columns=['0_y', 'ISOLATION METHOD'])
lipid_ev = lipid_ev.rename(columns={'0_x':'Method','PUBMED ID':'PubMedID'})
lipid_ev['Method'] = lipid_ev['Method'].replace("-",np.nan)

# Sample description: mapped through location_map with the same raw-text fallback
lipid_ev['SAMPLE'] = lipid_ev['SAMPLE'].str.lower()
lipid_ev = pd.merge(lipid_ev, location_map, right_on='0_y', left_on='SAMPLE', how='left')
lipid_ev['0_x'] = lipid_ev['0_x'].fillna(lipid_ev['SAMPLE'])
lipid_ev = lipid_ev.drop(columns=['0_y', 'SAMPLE'])
lipid_ev = lipid_ev.rename(columns={'0_x':'Location'})

# Rewrite free-text vesicle types as GO identifiers; rules run in order and the last one
# prefixes anything that still does not start with "GO_" with the default GO_1990742
vesicle_type_rules = [
    (r".*xosomes.*", "GO_0070062"),
    (r"Membrane blebs", "GO_0032059"),
    (r"Apoptotic bodies", "GO_0097189"),
    (r".*embrane", "GO_0016020"),
    (r".*icrovesicles", "GO_1990742"),
    (r"^(?!GO_.*)", "GO_1990742"),
]
for pattern, go_id in vesicle_type_rules:
    lipid_ev['VESICLE TYPE'] = lipid_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=True)
# raw string: "\d" is an invalid escape sequence in a normal string literal
lipid_ev['VESICLE TYPE'] = 'GO_'+lipid_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)

# Lipid name -> ChEBI identifier (column 1 of the map becomes the edge start)
lipid_ev = pd.merge(desc_chebi_map.rename(columns={0:'LIPID ID'}), lipid_ev, on=['LIPID ID'])
lipid_ev['Source'] = 'Vesiclepedia'
lipid_ev = lipid_ev.rename(columns={1:':START_ID', 'VESICLE TYPE':':END_ID'})
lipid_ev.head(n=2)

In [None]:
# Pool the protein and lipid location edges: GO annotations, Human Protein Atlas, and the
# two Vesiclepedia tables
located_in_sources = [pro_go_dict['pro_go_located_in'], pro_located_in2, protein_ev, lipid_ev]
OBO_located_in_OBO = pd.concat(located_in_sources)

# One row per (start, end) pair: categorical properties are collected into sets, TPM is averaged
located_in_aggregation = {
    'Source': set,
    'TPM': np.mean,
    'PubMedID': set,
    'Location': set,
    'Method': set,
}
OBO_located_in_OBO = OBO_located_in_OBO.groupby([':START_ID',':END_ID']).agg(located_in_aggregation).reset_index()
OBO_located_in_OBO[":TYPE"] = "located_in"
OBO_located_in_OBO.to_pickle(unprocessed_edge_data_location+'OBO_located_in_OBO.pkl')

# Inverse relation: same rows with the endpoints swapped
OBO_location_of_OBO = OBO_located_in_OBO.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
OBO_location_of_OBO[":TYPE"] = "location_of"
OBO_location_of_OBO.to_pickle(unprocessed_edge_data_location+'OBO_location_of_OBO.pkl')
OBO_location_of_OBO.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0000056 (participates in) - OBO

* [Ribocentre](https://www.ribocentre.org/) <br />
Ribocentre is designed to contain comprehensive information on all natural ribozymes.

In [None]:
# https://www.ribocentre.org/application/ --> Gene Expression system --> CSV button
ribozyme_go = pd.read_csv(unprocessed_data_location + 'Ribocentre - Application.csv')
print(ribozyme_go['ribozyme name'].unique())

# Ribozyme names -> Sequence Ontology identifiers (plain-text substitutions, in this order)
ribozyme_name_to_so = {
    'glmS ribozyme': 'SO_0000374',
    'hammerhead ribozyme': 'SO_0000380',
    'LC ribozyme': 'SO_0000374',
    'pistol ribozyme': 'SO_0000374',
    'RNase P': 'SO_0000386',
    'twister ribozyme': 'SO_0000374',
    'VS ribozyme': 'SO_0000374',
}
for ribozyme_name, so_id in ribozyme_name_to_so.items():
    ribozyme_go['ribozyme name'] = ribozyme_go['ribozyme name'].str.replace(ribozyme_name, so_id, regex=False)

# `.copy()` so the new column is added to an independent frame, not to a column slice
ribozyme_go = ribozyme_go[['ribozyme name', 'pubmed ID']].copy()
# Hand-curated GO term per CSV row, in file order ('nan' = no term assigned); the list
# must have exactly one entry per row of the downloaded file
ribozyme_go['GO'] = ['nan','nan','GO_0015867', 'GO_0032363', 'GO_0010468', 'GO_0010468', 'GO_0010468', 'GO_2000232',
                         'GO_0010468', 'GO_0010468', 'GO_0003743', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'nan', 'GO_0010468',
                         'nan', 'nan', 'nan', 'GO_0050790', 'nan', 'nan', 'nan', 'nan', 'nan', 'GO_0050790', 'nan', 'nan', 'nan', 'nan', 'nan']
ribozyme_go = ribozyme_go[ribozyme_go['GO'] != 'nan']

# Ask QuickGO for the aspect of each distinct GO term (one HTTP request per term).
# The accumulator is deliberately not called `dict`: that name would shadow the
# builtin for every cell executed afterwards.
go_aspects = []
goterms_in_ribocentre = ribozyme_go['GO'].unique()
for term in goterms_in_ribocentre:
    aspect = pd.read_json("https://www.ebi.ac.uk/QuickGO/services/ontology/go/terms/" + term.replace("_",":") + "/complete")['results'][0]
    go_aspects.append(aspect.get("aspect"))
goterms_in_ribocentre_map_relation = pd.DataFrame({'GO':goterms_in_ribocentre, 'Aspect':go_aspects})

ribozyme_go = pd.merge(ribozyme_go, goterms_in_ribocentre_map_relation, on='GO')
# HDV ribozyme is from delta virus, taxid = 12475

# HDV ribozyme is represented by two RNAcentral sequences: one row per sequence
ribozyme_go['ribozyme name'] = ribozyme_go['ribozyme name'].str.replace("HDV ribozyme", "URS00006C745E, URS00006C1D09", regex=False)
ribozyme_go['ribozyme name'] = ribozyme_go['ribozyme name'].str.split(", ")
ribozyme_go = ribozyme_go.explode('ribozyme name')
ribozyme_go = ribozyme_go.rename(columns={'ribozyme name':'RNA'})

ribozyme_go['Source'] = 'Ribocentre'

# PubMed IDs as digit-only strings (NaN when not numeric). The float suffix is stripped
# with an anchored regex: a bare ".0" pattern is interpreted as a regex by pandas < 2.0
# and would delete every "<any char>0" pair inside the ID.
ribozyme_go['pubmed ID'] = pd.to_numeric(ribozyme_go['pubmed ID'], errors='coerce')
ribozyme_go['pubmed ID'] = ribozyme_go['pubmed ID'].astype(str)
ribozyme_go['pubmed ID'] = ribozyme_go['pubmed ID'].str.replace(r"\.0$", "", regex=True)
ribozyme_go['pubmed ID'] = ribozyme_go['pubmed ID'].replace("nan", np.nan)

# Ribozyme classes (SO identifiers), split by GO aspect
ribozyme_go_so = ribozyme_go[ribozyme_go['RNA'].str.startswith('SO')][['RNA','GO','pubmed ID','Aspect','Source']]
ribozyme_go_so = ribozyme_go_so.rename(columns={'RNA':':START_ID','GO':':END_ID','pubmed ID':'PubMedID'})
ribozyme_go_so_mf = ribozyme_go_so[ribozyme_go_so['Aspect'] == 'molecular_function'].drop(columns=['Aspect'])
ribozyme_go_so_bp = ribozyme_go_so[ribozyme_go_so['Aspect'] == 'biological_process'].drop(columns=['Aspect'])
# Everything else is a concrete RNA sequence (RNAcentral identifier)
ribozyme_go_rnacentral = ribozyme_go[~ribozyme_go['RNA'].str.startswith('SO')][['RNA','GO','pubmed ID','Aspect','Source']]
print(ribozyme_go_rnacentral['Aspect'].unique())
ribozyme_go_rnacentral = ribozyme_go_rnacentral.drop(columns=['Aspect'])

ribozyme_go_so_bp.head(n=3)

In [None]:
# One edge per (SO ribozyme class, GO biological process) pair; sources and PubMed IDs are pooled into sets.
OBO_participates_in_OBO = (
    ribozyme_go_so_bp
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'participates_in'})
)
OBO_participates_in_OBO.to_pickle(unprocessed_edge_data_location + 'OBO_participates_in_OBO.pkl')

# Inverse edges: swap the endpoints and relabel the relation.
OBO_has_participant_OBO = (
    OBO_participates_in_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'has_participant'})
)
OBO_has_participant_OBO.to_pickle(unprocessed_edge_data_location + 'OBO_has_participant_OBO.pkl')
OBO_has_participant_OBO.head(3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0000085 (has function) - OBO

* [Ribocentre](https://www.ribocentre.org/) <br />
Ribocentre is designed to contain comprehensive information of all natural ribozymes.

In [None]:
# Preview the first three rows of the Ribocentre molecular-function table used for the "has function" edges.
ribozyme_go_so_mf.head(3)

In [None]:
# One edge per (start, end) pair of the molecular-function table; provenance columns become sets.
function_groups = ribozyme_go_so_mf.groupby([':START_ID', ':END_ID'])
OBO_has_function_OBO = function_groups.agg({'Source': set, 'PubMedID': set}).reset_index()
OBO_has_function_OBO = OBO_has_function_OBO.assign(**{':TYPE': 'has_function'})
OBO_has_function_OBO.to_pickle(unprocessed_edge_data_location + 'OBO_has_function_OBO.pkl')

# Inverse relation: same rows with the endpoints exchanged.
OBO_function_of_OBO = (
    OBO_has_function_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'function_of'})
)
OBO_function_of_OBO.to_pickle(unprocessed_edge_data_location + 'OBO_function_of_OBO.pkl')
OBO_function_of_OBO.head(3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0002205 (has gene product) - OBO

* Ensembl

In [None]:
# Work on a copy of the mapping so the shared `entrez_pro_map` frame is left untouched.
# Column 0 becomes the edge start and column 1 the edge end; every row is attributed to Ensembl.
entrez_pro = (
    entrez_pro_map
    .copy()
    .assign(Source='Ensembl')
    .rename(columns={0: ':START_ID', 1: ':END_ID'})
)
entrez_pro.head(3)

In [None]:
# Deduplicate (start, end) pairs, keeping the set of sources that reported each pair.
gene_has_gene_product_OBO = (
    entrez_pro
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'has_gene_product'})
)
gene_has_gene_product_OBO.to_pickle(unprocessed_edge_data_location + 'gene_has_gene_product_OBO.pkl')

# Inverse relation with swapped endpoints.
OBO_gene_product_of_gene = (
    gene_has_gene_product_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'gene_product_of'})
)
OBO_gene_product_of_gene.to_pickle(unprocessed_edge_data_location + 'OBO_gene_product_of_gene.pkl')
OBO_gene_product_of_gene.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002510 (transcribed from) - Gene

* Ensembl

In [None]:
# First two columns of the gene/transcript map: column 0 ends up as the edge end (gene),
# column 1 holds the Ensembl transcript ID used for the RNAcentral lookup.
gene_RNA = (
    pd.read_csv(processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', sep='\t', header=None)
    [[0, 1]]
    .drop_duplicates()
    .assign(Source='Ensembl')
)

# Left join onto RNAcentral: transcripts without an RNAcentral ID keep their Ensembl transcript ID.
gene_RNA = pd.merge(
    gene_RNA,
    rnacentral_map_human_ensembl[['RNAcentral ID', 'Ensembl transcript ID']],
    how='left',
    left_on=1,
    right_on='Ensembl transcript ID',
)
gene_RNA = gene_RNA.rename(columns={0: ':END_ID', 'RNAcentral ID': ':START_ID'})
gene_RNA[':START_ID'] = gene_RNA[':START_ID'].fillna(gene_RNA[1])
gene_RNA = gene_RNA.drop(columns=[1, 'Ensembl transcript ID'])
gene_RNA.head(3)

* circBase

In [None]:
circbase = pd.read_csv(unprocessed_data_location + 'hsa_hg19_circRNA.txt', sep='\t')

# Resolve the circBase gene symbols through `symbol_entrez_map` (column 0 = symbol, column 1 = mapped gene ID);
# circRNAs whose symbol is not in the map are dropped by the inner join.
symbol_lookup = symbol_entrez_map.rename(columns={0: 'gene symbol'})
circbase_transcribed_from = (
    circbase[['circRNA ID', 'gene symbol']]
    .merge(symbol_lookup, on='gene symbol')
    .drop(columns=['gene symbol'])
    .rename(columns={1: 'Gene', 'circRNA ID': 'RNA'})
    .drop_duplicates()
)
circbase_transcribed_from['Source'] = 'circBase'
circbase_transcribed_from = circbase_transcribed_from.rename(columns={'Gene': ':END_ID', 'RNA': ':START_ID'})
circbase_transcribed_from.head(3)

* [piRBase](http://bigdata.ibp.ac.cn/piRBase/)

In [None]:
# Read the locally cached piRBase page of every piRBase ID in the RNAcentral mapping and keep the
# third HTML table of each page, tagged with the piRNA it belongs to.
# Commented-out step that downloads the cached pages:
    #response = requests.get(url = 'http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=' + i)
    #with open(f'../resources/processed_data/unprocessed_data/piRBase/{i}.html', 'wb') as file:
    #    file.write(response.content)
pirbase_tables = []
for pirbase_id in rnacentral_map_human_pirbase['piRBase ID'].unique():
    temp = pd.read_html(f'../resources/processed_data/unprocessed_data/piRBase/{pirbase_id}.html')[2]
    temp['piRNA'] = pirbase_id
    pirbase_tables.append(temp)

# Concatenate once instead of growing the frame inside the loop (which copies all rows on every
# iteration); fall back to an empty, correctly shaped frame when there is nothing to read.
df = pd.concat(pirbase_tables) if pirbase_tables else pd.DataFrame(columns=['piRNA', 'Gene'])

df = df[['piRNA','Gene']].drop_duplicates()
df = df[df['Gene'].notna()]
df.head(n=3)

In [None]:
# A piRBase "Gene" cell may list several genes: strip the semicolons, split on spaces, one row per gene.
df['Gene'] = df['Gene'].str.replace(";", "", regex=False)
df['Gene'] = df['Gene'].str.split(" ")
df = df.explode('Gene')

# Resolve each gene label twice -- once through the Ensembl map and once through the symbol map --
# and keep the union of both mappings (column 1 of either map replaces the original label).
df_ensembl = df.merge(ensembl_entrezGene_map[[0,1]].rename(columns={0:'Gene'}), on='Gene').drop(columns=['Gene']).rename(columns={1:'Gene'})
df_symbol = df.merge(symbol_entrez_map[[0,1]].rename(columns={0:'Gene'}), on='Gene').drop(columns=['Gene']).rename(columns={1:'Gene'})
# The extra merge of `df` with the RNAcentral map that used to sit here was dead code: its result
# was overwritten by the concatenation below before ever being read.
df = pd.concat([df_ensembl, df_symbol]).drop_duplicates()

# Translate piRBase IDs into RNAcentral IDs (piRNAs absent from the map are dropped by the inner join).
df = df.merge(rnacentral_map_human_pirbase.rename(columns={'piRBase ID':'piRNA'}), on='piRNA').drop(columns=['piRNA'])
df['Source'] = 'piRBase'
df = df.rename(columns={'Gene':':END_ID','RNAcentral ID':':START_ID'})
df.head(n=3)

In [None]:
# Stack the piRBase (`df`), circBase and Ensembl tables, then collapse duplicate
# (start, end) pairs into a single edge carrying the set of contributing sources.
RNA_transcribed_from_Gene = (
    pd.concat([df, circbase_transcribed_from, gene_RNA])
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'transcribed_from'})
)
RNA_transcribed_from_Gene.to_pickle(unprocessed_edge_data_location + 'RNA_transcribed_from_gene.pkl')

# Inverse relation with swapped endpoints.
Gene_transcribed_to_RNA = (
    RNA_transcribed_from_Gene
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'transcribed_to'})
)
Gene_transcribed_to_RNA.to_pickle(unprocessed_edge_data_location + 'gene_transcribed_to_RNA.pkl')
Gene_transcribed_to_RNA.head(3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002566 (causally influences) - RNA

* [TarpiD](https://tarpid.nitrkl.ac.in/tarpid_db/)

In [None]:
# Mondo+HPO
# Scrape the TarpiD detail page of every piRNA in `piRNA_gene["0"]` and read the comma-separated
# text of its 10th `detail_content_right` block (presumably the disease field -- confirm on the site).
# Loop-local names are deliberately specific: the previous `content` overwrote the `content` module
# imported from reactome2py and `df` overwrote the piRBase edge table built earlier in the notebook.
disease_frames = []
for pirna_id in piRNA_gene["0"].unique():
    url = "https://tarpid.nitrkl.ac.in/tarpid_db/specific_search/pirna_detail.php?pirna=" + pirna_id
    # A timeout avoids hanging forever on a stalled connection; HTTP errors are raised explicitly
    # instead of surfacing later as an IndexError while parsing an error page.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    html_content = response.text

    soup = BeautifulSoup(html_content, 'html.parser')
    detail_content = soup.find_all('div', class_='detail_content_right')[9].text

    disease_names = detail_content.replace("&nbsp","").lower().split(', ')
    # Columns are named "0"/"1" (strings) on purpose: the rest of the notebook indexes this table
    # with string labels, which previously only worked after the CSV round trip commented out below.
    disease_frames.append(pd.DataFrame({"0": disease_names, "1": pirna_id}))

# Single concatenation instead of growing the frame on every iteration.
if disease_frames:
    piRNA_disease2 = pd.concat(disease_frames, ignore_index=True)
else:
    piRNA_disease2 = pd.DataFrame(columns=["0", "1"])

# Optional cache of the scraped table:
#piRNA_disease2.drop_duplicates().to_csv(unprocessed_data_location + 'piRNA-disease.txt', sep='\t', index=None)
#piRNA_disease2 = pd.read_csv(unprocessed_data_location + 'piRNA-disease.txt', sep='\t')

# "-" is the placeholder shown when a piRNA has no entry in that field.
piRNA_disease2 = piRNA_disease2[piRNA_disease2["0"] != '-']
piRNA_disease2.head(n=3)

In [None]:
# Keep identifiers of the form "piR-<digit>..." that contain none of the excluded tags
# ("sse", "mmu", "gga" -- presumably non-human species codes -- and "ur").
pirna_ids = piRNA_disease2["1"]
has_excluded_tag = (
    pirna_ids.str.contains("sse")
    | pirna_ids.str.contains("mmu")
    | pirna_ids.str.contains("gga")
    | pirna_ids.str.contains("ur")
)
has_expected_prefix = pirna_ids.str.contains(r'^piR-\d')
piRNA_disease2 = piRNA_disease2[has_expected_prefix & ~has_excluded_tag]

# Rewrite the prefix to the "piR-hsa-" form used by the piRBase IDs of the RNAcentral map,
# then swap the piRBase ID for its RNAcentral ID (unmapped piRNAs are dropped by the inner join).
piRNA_disease2 = piRNA_disease2.assign(**{"1": piRNA_disease2["1"].str.replace("piR-", "piR-hsa-")})
pirbase_lookup = rnacentral_map_human_pirbase.rename(columns={'piRBase ID': '1'})
piRNA_disease2 = (
    piRNA_disease2
    .merge(pirbase_lookup, on='1')
    .drop(columns=['1'])
    .rename(columns={'RNAcentral ID': 'RNA'})
)
piRNA_disease2["0"].unique()

In [None]:
# Map the free-text names in column "0" through `desc_disPhe_map` (column 0 = description,
# column 1 = mapped identifier); names without a match are dropped by the inner join.
piRNA_disease2 = (
    piRNA_disease2
    .merge(desc_disPhe_map, left_on="0", right_on=0)
    .drop(columns=[0, "0"])
    .rename(columns={1: 'Disease'})
    .loc[:, ['Disease', 'RNA']]
)
piRNA_disease2['Source'] = 'TarpiD'
piRNA_disease2 = piRNA_disease2.rename(columns={'Disease': ':START_ID', 'RNA': ':END_ID'})
piRNA_disease2.head(3)

In [None]:
# One edge per (start, end) pair, with the set of sources supporting it.
piRNA_disease2 = (
    piRNA_disease2
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': 'causally_influences'})
)
piRNA_disease2.to_pickle(unprocessed_edge_data_location + 'OBO_causally_influences_RNA.pkl')

# Inverse relation: the same variable is re-bound to the table with swapped endpoints.
piRNA_disease2 = (
    piRNA_disease2
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'causally_influenced_by'})
)
piRNA_disease2.to_pickle(unprocessed_edge_data_location + 'RNA_causally_influenced_by_OBO.pkl')
piRNA_disease2.head(3)

***
### SNP - http://purl.obolibrary.org/obo/RO_0003302 (causes or contributes to condition) - OBO

* ClinVar

In [None]:
# ClinVar variant summary: keep only the columns needed to build SNP -> phenotype edges.
clinvar_data = pd.read_csv(unprocessed_data_location + 'variant_summary.txt', header=0,
                           delimiter='\t', low_memory=False)[['Name', 'GeneID','Assembly','ClinSigSimple',
                                                              'RS# (dbSNP)','PhenotypeIDS','ReviewStatus']]
# NOTE(review): this keeps rows whose ClinSigSimple is NOT 1. In the ClinVar README, 1 appears to mean
# "at least one (likely) pathogenic interpretation", so the condition may be inverted for a
# "causes or contributes to condition" edge -- confirm the intent before changing it.
clinvar_data = clinvar_data[clinvar_data['ClinSigSimple'] != 1].drop(columns=['ClinSigSimple'])
# -1 marks a missing gene / dbSNP identifier.
clinvar_data = clinvar_data[clinvar_data['GeneID'] != -1].drop(columns=['GeneID'])
# Keep only the three highest-confidence review statuses.
trusted_review_statuses = ["criteria provided, multiple submitters, no conflicts",
                           "reviewed by expert panel",
                           "practice guideline"]
clinvar_data = clinvar_data[clinvar_data['ReviewStatus'].isin(trusted_review_statuses)].drop(columns=['ReviewStatus'])
# Explicit copy: columns are rewritten below and must not be assigned onto a slice.
clinvar_data = clinvar_data[clinvar_data['RS# (dbSNP)'] != -1].copy()
clinvar_data['RS# (dbSNP)'] = 'rs' + clinvar_data['RS# (dbSNP)'].astype(str)
# Keep the text before the first "." of the variant name (the un-versioned accession it starts with).
clinvar_data['Name'] = clinvar_data['Name'].str.split(".").str[0]
clinvar_data = clinvar_data[clinvar_data['Assembly'] == 'GRCh38'].drop(columns=['Assembly'])

# A PhenotypeIDS cell packs several identifiers separated by "|", ";" or ",": one row per identifier.
# A single character-class split replaces the three split/explode passes and the invalid "\|" escape.
clinvar_data['PhenotypeIDS'] = clinvar_data['PhenotypeIDS'].str.split(r"[|;,]")
clinvar_data = clinvar_data.explode('PhenotypeIDS')
clinvar_data['PhenotypeIDS'] = clinvar_data['PhenotypeIDS'].str.replace("MONDO:MONDO:", "MONDO_", regex=False)
clinvar_data['PhenotypeIDS'] = clinvar_data['PhenotypeIDS'].str.replace("Human Phenotype Ontology:HP:", "HP_", regex=False)
# Only Mondo and HPO identifiers are kept; na=False treats missing phenotype IDs as "no match".
is_hpo = clinvar_data['PhenotypeIDS'].str.startswith('HP', na=False)
is_mondo = clinvar_data['PhenotypeIDS'].str.startswith('MONDO', na=False)
clinvar_data = clinvar_data[is_hpo | is_mondo].copy()
clinvar_data['Source'] = 'ClinVar'

# Attach the RNAcentral ID of the RefSeq accession when one exists, otherwise keep the accession itself.
clinvar_data = pd.merge(clinvar_data, rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(
    columns={'RefSeq ID':'Name'}), on='Name', how='left')
clinvar_data['RNAcentral ID'] = clinvar_data['RNAcentral ID'].fillna(clinvar_data['Name'])

clinvar_data = clinvar_data.rename(columns={'RS# (dbSNP)':':START_ID','PhenotypeIDS':':END_ID','RNAcentral ID':'Interactor'})
clinvar_data.head(n=3)

* [PolymiRTS](https://compbio.uthsc.edu/miRSNP/home.php)

In [None]:
# PolymiRTS gene/disease-trait associations (Mondo+HPO); the Study and Link columns are not used.
mrna_disease = pd.read_csv(unprocessed_data_location + 'Genes_associated_with_human_diseases_traits.txt',sep='\t').drop(columns=['Study','Link'])
mrna_disease['p-Value'] = pd.to_numeric(mrna_disease['p-Value'], errors='coerce') # Mondo+HPO  
# Keep associations with p < 0.01 (unparseable p-values become NaN and are dropped by the comparison).
mrna_disease = mrna_disease[mrna_disease['p-Value'] < 0.01].copy()
mrna_disease['Disease/Trait'] = mrna_disease['Disease/Trait'].str.lower()
# Map lower-cased trait names through `desc_disPhe_map`; traits without a match are dropped.
mrna_disease = pd.merge(mrna_disease, desc_disPhe_map.rename(columns={0:'Disease/Trait'}), on='Disease/Trait').drop(
    columns=['Disease/Trait']).rename(columns={1:'Disease'})

# Attach the RNAcentral ID of the RefSeq transcript when available, otherwise keep the RefSeq ID.
mrna_disease = pd.merge(mrna_disease, rnacentral_map_human_refseq[['RNAcentral ID','RefSeq ID']].drop_duplicates().rename(
    columns={'RefSeq ID':'RefSeQID'}), on='RefSeQID', how='left')
mrna_disease['RNAcentral ID'] = mrna_disease['RNAcentral ID'].fillna(mrna_disease['RefSeQID'])

# Normalise PubMed IDs to digit strings (NaN when missing). Only a trailing ".0" is removed:
# the former pattern ".0" acts as a regex under pandas < 2 and deleted digits inside the ID.
mrna_disease['PUBMEDID'] = pd.to_numeric(mrna_disease['PUBMEDID'], errors='coerce')
mrna_disease['PUBMEDID'] = mrna_disease['PUBMEDID'].astype(str)
mrna_disease['PUBMEDID'] = mrna_disease['PUBMEDID'].str.replace(r"\.0$", "", regex=True)
mrna_disease['PUBMEDID'] = mrna_disease['PUBMEDID'].replace("nan", np.nan)

mrna_disease['Source'] = 'PolymiRTS'
# The transcript column is called 'RNAcentral ID' (see the merge above); the former key
# 'RNAcentral' matched nothing, so no 'Interactor' column was ever produced for PolymiRTS rows.
mrna_disease = mrna_disease.rename(columns={'SNPs':':START_ID','Disease':':END_ID','p-Value':'p-value',
                                            'RNAcentral ID':'Interactor','PUBMEDID':'PubMedID'})
mrna_disease.head(n=3)

In [None]:
# Merge the ClinVar and PolymiRTS tables and collapse duplicate (SNP, phenotype) pairs:
# provenance columns are pooled into sets, p-values are averaged (NaN when none is available).
snp_causally_influences_obo = pd.concat([clinvar_data, mrna_disease])
snp_causally_influences_obo = snp_causally_influences_obo.groupby([':START_ID',':END_ID']).agg({'Source':set,'Interactor':set,
                                                                                                'PubMedID':set,'p-value':'mean'}).reset_index()
# The relation column is ':TYPE' in every other edge table of this notebook; the former 'TYPE'
# (no colon) left this file with a differently named column.
snp_causally_influences_obo[':TYPE'] = "causes_or_contributes_to_condition"
snp_causally_influences_obo.to_pickle(unprocessed_edge_data_location+'SNP_causes_or_contributes_to_condition_OBO.pkl')
snp_causally_influences_obo.head(n=3)

***
### RNA - http://purl.obolibrary.org/obo/BFO_0000050 (part of) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations) <br/> The Gene Ontology (GO) knowledgebase is the world’s largest source of information on the functions of genes. This knowledge is both human-readable and machine-readable, and is a foundation for computational analysis of large-scale molecular biology and genetics experiments in biomedical research. 

In [None]:
import gzip

# Load the human RNA GO annotation file (GAF); lines starting with "!" are header comments.
with gzip.open(unprocessed_data_location+'goa_human_rna.gaf.gz') as f: # GO
    go_annotations = pd.read_csv(f, comment='!', delimiter='\t', names=[
        'DB', 'DB Object ID', 'DB Object Symbol', 'Qualifier', 'GO ID', 'DB:Reference (|DB:Reference)', 'Evidence Code',
        'With (or) From', 'Aspect', 'DB Object Name', 'DB Object Synonym (|Synonym)', 'DB Object Type', 'Taxon(|taxon)',
        'Date', 'Assigned By', 'Annotation Extension', 'Gene Product Form ID'])

# Explicit copy: the filtered table is modified column by column below.
go_annotations = go_annotations[go_annotations['Taxon(|taxon)'] == 'taxon:9606'].copy()

# Short codes for the GO qualifiers; the per-relation cells further down select rows by these codes.
# Whole-value replacement: qualifiers not listed here are left unchanged.
qualifier_codes = {
    'enables': '2327', # RO_0002327
    'involved_in': '2331',
    'located_in': '1025',
    'part_of': 'BFO50',
    'acts_upstream_of': '2263',
    'acts_upstream_of_or_within': '2264',
    'is_active_in': '2432',
    'acts_upstream_of_or_within_negative_effect': '4033',
    'acts_upstream_of_negative_effect': '4035',
    'colocalizes_with': '2325',
    'contributes_to': '2326',
    'NOT|involved_in': '2331?NOT',
    'NOT|located_in': '1025?NOT',
}
go_annotations['Qualifier'] = go_annotations['Qualifier'].replace(qualifier_codes)

print("Are all 'DB Object ID' and 'DB Object Symbol' cells equal?", all(go_annotations['DB Object ID'] == go_annotations['DB Object Symbol']))
go_annotations = go_annotations.drop(columns=['DB Object Symbol'])
go_annotations['GO ID'] = go_annotations['GO ID'].str.replace('GO:', 'GO_', regex=False)
# Drop everything from the first underscore onwards in the object ID.
go_annotations['DB Object ID'] = go_annotations['DB Object ID'].str.replace('_.*', '', regex=True)
go_annotations = go_annotations.drop(columns=['Gene Product Form ID', 'DB Object Synonym (|Synonym)', 'DB','Date','Annotation Extension',
                                              'With (or) From','DB Object Name',
                                              'DB Object Type','Taxon(|taxon)',"Aspect"])

print(go_annotations[go_annotations['DB Object ID'].isna()])

# Every annotation is credited to "GOC"; the groups listed below are folded into "GOC" itself,
# any other assigner is kept as a second source next to it.
go_annotations['Assigned By'] = "GOC|" + go_annotations['Assigned By'].astype(str)
assigner_sources = {
    "GOC|GOC": "GOC",
    "GOC|BHF-UCL": "GOC",
    "GOC|ARUK-UCL": "GOC",
    "GOC|DIBU": "GOC",
    "GOC|ParkinsonsUK-UCL": "GOC",
    "GOC|FlyBase": "GOC",
    "GOC|UniProt": "GOC|UniProtKB",
}
go_annotations['Assigned By'] = go_annotations['Assigned By'].replace(assigner_sources)
go_annotations = go_annotations.rename(columns={'DB Object ID':':START_ID','GO ID':':END_ID','Evidence Code':'GO_evidence','Assigned By':'Source'})
go_annotations['Source'] = go_annotations['Source'].str.split('|')
go_annotations = go_annotations.explode('Source')

# PubMed ID: the text after the first "PMID:" of the reference field, kept only when it is purely
# numeric (references with further "|"-separated entries after the PMID therefore become NaN).
reference_column = 'DB:Reference (|DB:Reference)'
pubmed_ids = go_annotations[reference_column].str.split('PMID:').str[1]
pubmed_ids = pd.to_numeric(pubmed_ids, errors='coerce').replace(0, np.nan)
# Back to digit strings. Only a trailing ".0" is stripped: the former pattern ".0" is a regex under
# pandas < 2 ("any character followed by 0") and deleted digits inside the identifier.
pubmed_ids = pubmed_ids.astype(str).str.replace(r"\.0$", "", regex=True)
go_annotations[reference_column] = pubmed_ids.replace({"<NA>": np.nan, "nan": np.nan})
go_annotations = go_annotations.rename(columns={reference_column:'PubMedID'})

go_annotationsBFO50 = go_annotations[go_annotations['Qualifier'] == 'BFO50'].drop(columns=['Qualifier'])
go_annotationsBFO50.head(n=3)

* [Vesiclepedia](http://microvesicles.org/index.html) <br/> Vesiclepedia is a manually curated compendium of molecular data (lipid, RNA and protein) identified in different classes of extracellular vesicles. 

In [None]:
# Download the Vesiclepedia RNA annotation file (GPAD) into the unprocessed data folder.
!wget http://microvesicles.org/Archive/Vesiclepedia_RNAs_GPAD.txt -O ../resources/processed_data/unprocessed_data/Vesiclepedia_RNAs_GPAD.txt

In [None]:
# Vesiclepedia RNA annotations in GPAD layout; lines starting with "!" are header comments.
go_annotationsV = pd.read_csv(unprocessed_data_location+"Vesiclepedia_RNAs_GPAD.txt", comment='!', delimiter='\t', names=[
        'DB','DB Object ID','Qualifier','GO ID','DB:Reference(s)','Evidence Code','With (or) From','Interacting taxon ID','Date',
        'Assigned By','Annotation Extension','Annotation Properties'])
# Both the database and the assigner count as sources: join them now, explode them further down.
go_annotationsV['DB'] = go_annotationsV['DB'] + ", " + go_annotationsV['Assigned By']
go_annotationsV = go_annotationsV.drop(columns=['Assigned By','With (or) From','Interacting taxon ID','Date','Annotation Properties'])
# Human entries only (ID suffix "_9606"). TODO: add the other species considered later on.
go_annotationsV = go_annotationsV[go_annotationsV['DB Object ID'].str.endswith('_9606')].copy()
go_annotationsV['DB Object ID'] = go_annotationsV['DB Object ID'].str.replace("_9606",'', regex=False)
go_annotationsV['GO ID'] = go_annotationsV['GO ID'].str.replace(':','_', regex=False)
go_annotationsV['DB:Reference(s)'] = go_annotationsV['DB:Reference(s)'].str.replace('PMID:','', regex=False)
print(go_annotationsV['Qualifier'].unique())
print(go_annotationsV['DB'].unique())
go_annotationsV['DB'] = go_annotationsV['DB'].str.split(", ")
go_annotationsV = go_annotationsV.explode('DB')
# Keep the part of the evidence code before the first ":".
go_annotationsV['Evidence Code'] = go_annotationsV['Evidence Code'].str.split(':').str[0]
print(go_annotationsV['Evidence Code'].unique())

# Normalise PubMed IDs to digit strings (NaN when missing or non-numeric). Only a trailing ".0" is
# removed: the former pattern ".0" is a regex under pandas < 2 and deleted digits inside the ID.
go_annotationsV['DB:Reference(s)'] = pd.to_numeric(go_annotationsV['DB:Reference(s)'], errors='coerce')
go_annotationsV['DB:Reference(s)'] = go_annotationsV['DB:Reference(s)'].astype(str)
go_annotationsV['DB:Reference(s)'] = go_annotationsV['DB:Reference(s)'].str.replace(r"\.0$", "", regex=True)
go_annotationsV['DB:Reference(s)'] = go_annotationsV['DB:Reference(s)'].replace("nan", np.nan)

# The annotation extension looks like "relation(PREFIX:ID)": keep the identifier inside the parentheses,
# written with "_" instead of ":", and translate it through `desc_anatomyCell_map` when possible.
go_annotationsV['Annotation Extension'] = go_annotationsV['Annotation Extension'].str.split("(").str[1]
go_annotationsV['Annotation Extension'] = go_annotationsV['Annotation Extension'].str.replace(":", "_", regex=False).str.replace(r"\)", "",regex=True)
go_annotationsV = pd.merge(go_annotationsV, desc_anatomyCell_map.rename(columns={1:'Annotation Extension'}), on='Annotation Extension', how='left')
# Column 0 of the map becomes the location; unmapped extensions fall back to the raw identifier.
go_annotationsV[0] = go_annotationsV[0].fillna(go_annotationsV['Annotation Extension'])

go_annotationsV = go_annotationsV.rename(columns={'DB':'Source','DB Object ID':':START_ID','GO ID':':END_ID',0:'Location',
                                                  'Evidence Code':'GO_evidence','DB:Reference(s)':'PubMedID'})
go_annotationsV.head(n=3)

In [None]:
# Combine the GO "part_of" annotations with the Vesiclepedia table and collapse duplicate
# (start, end) pairs; each attribute column becomes the set of values seen for the pair.
go_annotationsBFO50 = (
    pd.concat([go_annotationsBFO50, go_annotationsV])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set, 'Location': set})
    .reset_index()
    .assign(**{':TYPE': 'part_of'})
)
go_annotationsBFO50.to_pickle(unprocessed_edge_data_location + 'RNA_part_of_OBO.pkl')

# Inverse relation: the same variable is re-bound to the table with swapped endpoints.
go_annotationsBFO50 = (
    go_annotationsBFO50
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'has_part'})
)
go_annotationsBFO50.to_pickle(unprocessed_edge_data_location + 'OBO_has_part_RNA.pkl')
go_annotationsBFO50.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002331?NOT (not involved in) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '2331?NOT', collapsed to one edge per (start, end) pair.
go_annotations2331NOT = (
    go_annotations
    .loc[go_annotations['Qualifier'] == '2331?NOT']
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'not_involved_in'})
)
go_annotations2331NOT.to_pickle(unprocessed_edge_data_location + 'RNA_not_involved_in_OBO.pkl')
go_annotations2331NOT.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0001025?NOT (not located in) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '1025?NOT', collapsed to one edge per (start, end) pair.
go_annotations1025NOT = (
    go_annotations
    .loc[go_annotations['Qualifier'] == '1025?NOT']
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'not_located_in'})
)
go_annotations1025NOT.to_pickle(unprocessed_edge_data_location + 'RNA_not_located_in_OBO.pkl')

# Inverse relation: the same variable is re-bound to the table with swapped endpoints.
go_annotations1025NOT = (
    go_annotations1025NOT
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_location_of'})
)
go_annotations1025NOT.to_pickle(unprocessed_edge_data_location + 'OBO_not_location_of_RNA.pkl')
go_annotations1025NOT.head(5)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002432 (is active in) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '2432', collapsed to one edge per (start, end) pair.
active_in_rows = go_annotations[go_annotations['Qualifier'] == '2432'].drop(columns=['Qualifier'])
go_annotations2432 = (
    active_in_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
)
go_annotations2432 = go_annotations2432.assign(**{':TYPE': 'is_active_in'})
go_annotations2432.to_pickle(unprocessed_edge_data_location + 'RNA_is_active_in_OBO.pkl')
go_annotations2432.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0004033 (acts upstream of or within, negative effect) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '4033', collapsed to one edge per (start, end) pair.
go_annotations4033 = (
    go_annotations
    .loc[go_annotations['Qualifier'] == '4033']
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of_or_within_negative_effect'})
)
go_annotations4033.to_pickle(unprocessed_edge_data_location + 'RNA_acts_upstream_of_or_within_negative_effect_OBO.pkl')
go_annotations4033.head(5)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002325 (colocalizes with) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '2325', collapsed to one edge per (start, end) pair.
go_annotations2325 = (
    go_annotations
    .loc[go_annotations['Qualifier'] == '2325']
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'colocalizes_with'})
)
go_annotations2325.to_pickle(unprocessed_edge_data_location + 'RNA_colocalizes_with_OBO.pkl')

# The reverse edges keep the same relation label: only the endpoints are swapped.
go_annotations2325 = (
    go_annotations2325
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'colocalizes_with'})
)
go_annotations2325.to_pickle(unprocessed_edge_data_location + 'OBO_colocalizes_with_RNA.pkl')
go_annotations2325.head(5)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002264 (acts upstream of or within) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '2264', collapsed to one edge per (start, end) pair.
upstream_or_within_rows = go_annotations[go_annotations['Qualifier'] == '2264'].drop(columns=['Qualifier'])
go_annotations2264 = (
    upstream_or_within_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
)
go_annotations2264 = go_annotations2264.assign(**{':TYPE': 'acts_upstream_of_or_within'})
go_annotations2264.to_pickle(unprocessed_edge_data_location + 'RNA_acts_upstream_of_or_within_OBO.pkl')
go_annotations2264.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002263 (acts upstream of) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '2263', collapsed to one edge per (start, end) pair.
go_annotations2263 = (
    go_annotations
    .loc[go_annotations['Qualifier'] == '2263']
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of'})
)
go_annotations2263.to_pickle(unprocessed_edge_data_location + 'RNA_acts_upstream_of_OBO.pkl')
go_annotations2263.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002327 (enables) - OBO

* [The GO resource](https://geneontology.org/) (GO annotations)

In [None]:
# Annotations carrying qualifier code '2327', collapsed to one edge per (start, end) pair.
go_annotations2327 = (
    go_annotations
    .loc[go_annotations['Qualifier'] == '2327']
    .drop(columns=['Qualifier'])
    .groupby([':START_ID', ':END_ID'])
    .agg({'GO_evidence': set, 'Source': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'enables'})
)
go_annotations2327.to_pickle(unprocessed_edge_data_location + 'RNA_enables_OBO.pkl')

# Inverse relation: the same variable is re-bound to the table with swapped endpoints.
go_annotations2327 = (
    go_annotations2327
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'enabled_by'})
)
go_annotations2327.to_pickle(unprocessed_edge_data_location + 'OBO_enabled_by_RNA.pkl')
go_annotations2327.head(3)

***
### RNA - http://purl.obolibrary.org/obo/RO_0002331 (involved in) - OBO

* [DirectRMDB](http://www.rnamd.org/directRMDB/index.html) <br/> DirectRMDB is a database of quantitative RNA modification profiles.

In [None]:
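# Manual download: http://www.rnamd.org/directRMDB/download.html --> Homo sapiens --> ZIP button,
# then unzip the archive so that the `HomoSapiens/` folder sits inside the unprocessed data folder.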

In [None]:
# Load the four DirectRMDB tables (Modification -- SO); they share the modification-site key 'ID'.
directrmdb_dir = unprocessed_data_location + 'HomoSapiens/'
genome = pd.read_csv(directrmdb_dir + 'HomoSapiens_genome.txt', sep='\t')[
    ['ID', 'modification', 'software', 'cell_line', 'Ensembl_ID', 'Gene_Biotype']]
mirna = pd.read_csv(directrmdb_dir + 'HomoSapiens_miRNA.txt', sep='\t')[['ID', 'Source', 'Name']]
mirna = mirna.rename(columns={'Name': 'miRNA'})
rbp = pd.read_csv(directrmdb_dir + 'HomoSapiens_RBP.txt', sep='\t')[['ID', 'Source', 'Name']]
rbp = rbp.rename(columns={'Name': 'RBP'})
snp = pd.read_csv(directrmdb_dir + 'HomoSapiens_SNP.txt', sep='\t')[['ID', 'rs_ID']]

# Inner joins: only sites present in all four tables survive. The two 'Source' columns
# (miRNA table, RBP table) come out of the joins as 'Source_x' and 'Source_y'.
rna_mod = genome.merge(mirna, on='ID').merge(rbp, on='ID').merge(snp, on='ID')

# Sites without an Ensembl ID are discarded; the remaining rows list DirectRMDB plus the
# miRNA/RBP sources, with missing ("nan") entries removed, and get one row per source.
rna_mod = rna_mod[rna_mod['Ensembl_ID'].notna()].copy()
combined_sources = 'DirectRMDB' + ", " + rna_mod['Source_x'].astype(str) + ", " + rna_mod['Source_y'].astype(str)
combined_sources = combined_sources.str.replace(", nan", "").str.replace("nan, ", "")
rna_mod['Source'] = combined_sources.str.split(", ")
rna_mod = rna_mod.explode('Source')
rna_mod = rna_mod[['modification', 'software', 'cell_line', 'Ensembl_ID', 'Gene_Biotype', 'miRNA', 'RBP', 'rs_ID', 'Source']]

# Strip the version suffix of the Ensembl ID, then expand each gene into its transcripts
# whose transcript type equals the gene biotype reported by DirectRMDB.
rna_mod['Ensembl_ID'] = rna_mod['Ensembl_ID'].str.split('.').str[0]
transcript_lookup = ensembl_map[['ensembl_gene_id', 'transcript_stable_id', 'ensembl_transcript_type']].drop_duplicates()
rna_mod = rna_mod.merge(transcript_lookup,
                        left_on=['Ensembl_ID', 'Gene_Biotype'],
                        right_on=['ensembl_gene_id', 'ensembl_transcript_type'])

# Cell lines: one row per ";"-separated entry, translated through `location_map`
# ('0_y' = label, '0_x' = mapped value); unmapped labels are kept as they are.
rna_mod['cell_line'] = rna_mod['cell_line'].str.lower().str.split(";")
rna_mod = rna_mod.explode('cell_line')
rna_mod = rna_mod.merge(location_map, how='left', left_on='cell_line', right_on='0_y')
rna_mod['0_x'] = rna_mod['0_x'].fillna(rna_mod['cell_line'])
rna_mod = rna_mod.drop(columns=['0_y', 'cell_line']).rename(columns={'0_x': 'Location'})

# Software: same treatment through `method_map`.
rna_mod['software'] = rna_mod['software'].str.lower().str.split(";")
rna_mod = rna_mod.explode('software')
rna_mod = rna_mod.merge(method_map, how='left', left_on='software', right_on='0_y')
rna_mod['0_x'] = rna_mod['0_x'].fillna(rna_mod['software'])
rna_mod = rna_mod.drop(columns=['0_y', 'software']).rename(columns={'0_x': 'Method'})

rna_mod = rna_mod[['transcript_stable_id', 'modification', 'Method', 'Location', 'miRNA', 'RBP', 'rs_ID', 'Source']].drop_duplicates()
rna_mod.head(3)

In [None]:
print(rna_mod.modification.unique())

# Translate DirectRMDB modification names into SO identifiers. The result is assigned back to the
# column: an in-place replace on `rna_mod.modification` acts on a temporary Series and is not
# guaranteed to reach the frame (it is a no-op under pandas copy-on-write).
modification_to_so = {'Psi':'SO_0001373',
'm5C': 'SO_0001918',
'm6A': 'SO_0001920',
'm7G':'SO_0001326',
'm1A': 'SO_0001295',
'Cm': 'SO_0001283',
'Tm': 'SO_0001382',
'Am': 'SO_0001298',
'Gm': 'SO_0001327',
'm5U': 'SO_0001344'}
rna_mod['modification'] = rna_mod['modification'].replace(modification_to_so)

# Rows whose Ensembl transcript has an RNAcentral ID.
ensembl_to_rnacentral = rnacentral_map_human_ensembl[['RNAcentral ID','Ensembl transcript ID']].drop_duplicates().rename(
    columns={'Ensembl transcript ID':'transcript_stable_id'})
rna_mod_rnacentral = pd.merge(rna_mod, ensembl_to_rnacentral, on = 'transcript_stable_id')
# Rows WITHOUT an RNAcentral ID keep their Ensembl transcript ID. This must be computed while
# `rna_mod_rnacentral` still holds Ensembl IDs: the previous code compared against the column after
# it had been overwritten with RNAcentral IDs, so nothing matched and mapped transcripts were kept twice.
rna_mod_ensembl = rna_mod[~rna_mod['transcript_stable_id'].isin(rna_mod_rnacentral['transcript_stable_id'])]
rna_mod_rnacentral = rna_mod_rnacentral.drop(columns=['transcript_stable_id']).rename(
    columns={'RNAcentral ID':'transcript_stable_id'})

rna_mod = pd.concat([rna_mod_ensembl, rna_mod_rnacentral]).rename(columns={'transcript_stable_id':':START_ID','rs_ID':'Mutation','modification':':END_ID'})
# Stack the table twice so that both the miRNA and the RBP end up in a single 'Interactor' column.
rna_mod = pd.concat([rna_mod.rename(columns={'miRNA':'Interactor'}),
                     rna_mod.rename(columns={'RBP':'Interactor'})]).drop(columns=['miRNA','RBP']).drop_duplicates()
rna_mod.head(n=3)

* [Modomics](https://genesilico.pl/modomics/) <br/> Modomics is a database of RNA modifications that provides comprehensive information concerning the chemical structures of modified ribonucleosides, their biosynthetic pathways, the location of modified residues in RNA sequences, and RNA modifying enzymes.

In [None]:
# https://genesilico.pl/modomics/snornas --> CSV button
modomics = pd.read_csv(unprocessed_data_location+'modomics2.csv') # Epigenetic modification (SO)
modomics['ORF/Alternative name'] = modomics['ORF/Alternative name'].str.strip()
modomics = modomics[modomics['Organism'].str.contains('apiens')].drop(columns=['Name','Organism'])
modomics = pd.merge(modomics, rnacentral_map_human[['RNAcentral ID','DB Description']].drop_duplicates().rename(
    columns={'DB Description':'ORF/Alternative name'}),on ='ORF/Alternative name').drop(columns=['ORF/Alternative name'])
#modomics['ORF/Alternative name'] = modomics['ORF/Alternative name'].astype(str).str.lower()
#modomics['ORF/Alternative name'] = modomics['ORF/Alternative name'].str[0:3] + 'RNA'
print(modomics['Modification type'].unique())
modomics.head(n=3)

In [None]:
# Recode Modomics modification abbreviations to Sequence Ontology (SO) identifiers.
# Assign the result back instead of `modomics['Modification type'].replace(..., inplace=True)`:
# that form is chained assignment and does not update `modomics` under pandas Copy-on-Write.
modomics['Modification type'] = modomics['Modification type'].replace(
    {'Y':'SO_0001332','Cm': 'SO_0001283','Gm':'SO_0001327',
     'Am':'SO_0001298','Um':'SO_0001345'})
# Trim the position string and turn every run of whitespace into a single '-'.
modomics['Modification position'] = modomics['Modification position'].str.strip().str.replace(r'\s+', '-', regex=True)
modomics['Source'] = 'Modomics'
# Rename to the edge-table column names and drop columns that are not exported.
modomics = modomics.rename(columns={'Modification type':':END_ID', 'RNAcentral ID':':START_ID','Modification position':'Position'})
modomics = modomics.drop(columns=['Target RNA type','Complex'])
modomics.head(n=3)

* [EpimiR](http://www.jianglab.cn/EpimiR/index.jsp) <br />
The EpimiR database collects 1974 regulatory relationships between 19 types of epigenetic modifications (including DNA methylation, histone acetylation, H3K4me3 and H3K27me3, etc.) and 617 miRNAs across 7 species (including Homo sapiens), extracted from nearly 2000 publications.

In [None]:
!wget https://www.dropbox.com/s/p852ndpck5jasxz/miRNet-mir-epi-hsa.csv?dl=0 -O ../resources/processed_data/unprocessed_data/miRNet-mir-epi-hsa.csv

In [None]:
# Load the miRNA -- epigenetic modification table.
# The download cell stores the file via `wget -O .../miRNet-mir-epi-hsa.csv`, so that is the
# name to read (the former 'miRNet-mir-epi-hsa.csv?dl=0' is not what wget writes).
miRNA_epiMod = pd.read_csv(unprocessed_data_location + 'miRNet-mir-epi-hsa.csv')
# Keep only rows flagged with 'high' expression, then drop columns that are not exported.
miRNA_epiMod = miRNA_epiMod[miRNA_epiMod['expression'] == 'high']
miRNA_epiMod = miRNA_epiMod.drop(columns=['mirnet','mir_id','note','res_type','year','support','detect','expression'])
# One row per '/'-separated modification; move the column to position 1.
miRNA_epiMod['epi_modification'] = miRNA_epiMod.epi_modification.str.split('/')
miRNA_epiMod = miRNA_epiMod.explode('epi_modification')
miRNA_epiMod.insert(1, 'epi_modification', miRNA_epiMod.pop("epi_modification"))

print(all(miRNA_epiMod['mir_acc'].isin(rnacentral_map_human['DB ID'])))
# These are miRBase dead entries
print(miRNA_epiMod[~miRNA_epiMod['mir_acc'].isin(rnacentral_map_human['DB ID'])])
# Replace the miRBase accession by its RNAcentral ID (inner merge: unmatched accessions are dropped).
miRNA_epiMod = pd.merge(miRNA_epiMod, rnacentral_map_human.rename(columns={'DB ID':'mir_acc'}), on='mir_acc').drop(
    columns=['DB','Organism','RNA category','DB Description','mir_acc']).rename(columns={'RNAcentral ID':'RNA'})

# Every record is emitted twice, once attributed to each source.
miRNA_epiMod['Source'] = 'EpimiR'
miRNA_epiMod2 = miRNA_epiMod.copy()
miRNA_epiMod2['Source'] = 'miRNet'
miRNA_epiMod = pd.concat([miRNA_epiMod, miRNA_epiMod2])

# One row per '/'-separated value in each multi-valued column.
for multi_valued in ['epi_regulator', 'experiment', 'epi_target', 'condition']:
    miRNA_epiMod[multi_valued] = miRNA_epiMod[multi_valued].str.split('/')
    miRNA_epiMod = miRNA_epiMod.explode(multi_valued)

# Normalise PubMed IDs to digit strings (non-numeric values become NaN).
# The float suffix is removed with an anchored, escaped regex: the former
# `.str.replace(".0", "")` is a regex on pandas < 2.0, where '.' matches any character,
# so it also deleted digit pairs such as "30" from inside the identifier.
miRNA_epiMod['pmid'] = pd.to_numeric(miRNA_epiMod['pmid'], errors='coerce')
miRNA_epiMod['pmid'] = miRNA_epiMod['pmid'].astype(str)
miRNA_epiMod['pmid'] = miRNA_epiMod['pmid'].str.replace(r"\.0$", "", regex=True)
miRNA_epiMod['pmid'] = miRNA_epiMod['pmid'].replace("nan", np.nan)

# Map lower-cased experiment labels through method_map; labels without a match are kept as-is.
# (The column was already exploded on '/' above, so no second split is needed.)
miRNA_epiMod['experiment'] = miRNA_epiMod['experiment'].str.lower()
miRNA_epiMod = pd.merge(miRNA_epiMod, method_map, right_on='0_y', left_on='experiment', how='left')
miRNA_epiMod['0_x'] = miRNA_epiMod['0_x'].fillna(miRNA_epiMod['experiment'])
miRNA_epiMod = miRNA_epiMod.drop(columns=['0_y', 'experiment'])
miRNA_epiMod = miRNA_epiMod.rename(columns={'0_x':'Method','pmid':'PubMedID','epi_regulator':'Regulator','epi_target':'Interactor'})

# Same pattern for the condition column, mapped through disease_map and exported as 'Location'.
miRNA_epiMod['condition'] = miRNA_epiMod['condition'].str.lower()
miRNA_epiMod = pd.merge(miRNA_epiMod, disease_map, right_on='0_y', left_on='condition', how='left')
miRNA_epiMod['0_x'] = miRNA_epiMod['0_x'].fillna(miRNA_epiMod['condition'])
miRNA_epiMod = miRNA_epiMod.drop(columns=['0_y', 'condition'])
miRNA_epiMod = miRNA_epiMod.rename(columns={'0_x':'Location'})

miRNA_epiMod.head(n=3)

In [None]:
# Substring rewrites applied, in this order, to the lower-cased modification label so that
# it can be matched against the descriptions in desc_so_map.
# NOTE(review): these are substring replacements, so a label such as 'h3k4me3' becomes
# 'h3k4 methylation site3' and will only survive the merge below if desc_so_map contains
# that exact text -- confirm against the data.
epi_label_rewrites = [
    ('dna methylation', 'silenced by dna methylation'),
    ('h3k4me', 'h3k4 methylation site'),
    ('h5ac', 'histone acetylation'),
    ('h3k9me', 'h3k9 methylation site'),
    ('h3k27me', 'h3k27 methylation site'),
    ('h3s10p', 'phosphorylation site'),
    ('h3r17me2', 'histone methylation site'),
]
epi_labels = miRNA_epiMod['epi_modification'].str.lower()
for old_text, new_text in epi_label_rewrites:
    epi_labels = epi_labels.str.replace(old_text, new_text)
miRNA_epiMod['epi_modification'] = epi_labels

# Replace the label by column 1 of desc_so_map, joined on column 0
# (inner merge: labels without a match are dropped).
miRNA_epiMod = (
    miRNA_epiMod
    .merge(desc_so_map, left_on='epi_modification', right_on=0)
    .drop(columns=['epi_modification', 0])
    .rename(columns={1: 'epi_modification'})
)

# Put the mapped column back at position 1 and switch to edge-table column names.
miRNA_epiMod.insert(1, 'epi_modification', miRNA_epiMod.pop("epi_modification"))
miRNA_epiMod = miRNA_epiMod.rename(columns={'epi_modification':':END_ID', 'RNA':':START_ID'})
miRNA_epiMod.head(n=3)

In [None]:
# Stack the three RNA -> SO tables, drop exact duplicates, and collapse them to one row
# per (:START_ID, :END_ID) pair, gathering each attribute's values into a set.
edge_attribute_columns = ['Source', 'Location', 'Method', 'Mutation', 'Position',
                          'Regulator', 'Interactor', 'PubMedID']
rna_involved_in_so = pd.concat([rna_mod, modomics, miRNA_epiMod]).drop_duplicates()
rna_involved_in_so = (
    rna_involved_in_so
    .groupby([':START_ID', ':END_ID'])
    .agg(dict.fromkeys(edge_attribute_columns, set))
    .reset_index()
)
rna_involved_in_so[":TYPE"] = "involved_in"
rna_involved_in_so.to_pickle(unprocessed_edge_data_location + 'RNA_involved_in_OBO.pkl')
rna_involved_in_so.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002479 (has part that occurs in) - OBO

* [Modomics](https://genesilico.pl/modomics/) <br/> Modomics is a database of RNA modifications that provides comprehensive information concerning the chemical structures of modified ribonucleosides, their biosynthetic pathways, the location of modified residues in RNA sequences, and RNA modifying enzymes.

In [None]:
# Source file: https://genesilico.pl/modomics/diseases --> CSV button
modomics = pd.read_csv(unprocessed_data_location + 'modomics.csv').drop(columns=['Description'])  # PRO -- Epigenetic modification (SO)

# One row per space-separated enzyme symbol; keep only symbols present in symbol_to_pro
# (inner merge on its column 0, which also brings in its column 1).
modomics = modomics.assign(Enzymes=modomics['Enzymes'].str.split(' ')).explode('Enzymes')
modomics = modomics.merge(symbol_to_pro.rename(columns={0: 'Enzymes'}), on='Enzymes')

# Map the lower-cased disease name through location_map; names without a match are kept as-is.
# NOTE(review): disease names are looked up in `location_map` here -- confirm that this map
# (rather than a disease-specific one) is the intended lookup.
modomics = modomics.assign(**{'Disease Name': modomics['Disease Name'].str.lower()})
modomics = modomics.merge(location_map, right_on='0_y', left_on='Disease Name', how='left')
modomics['0_x'] = modomics['0_x'].fillna(modomics['Disease Name'])
modomics = modomics.drop(columns=['0_y', 'Disease Name']).rename(columns={'0_x': 'Location'})

print(modomics.Reaction.unique())
modomics = modomics.drop(columns=['Enzymes'])
modomics.head(n=3)

In [None]:
# Recode Modomics reaction labels to ontology identifiers (SO_* or GO_*).
# Assign the result back instead of `modomics.Reaction.replace(..., inplace=True)`: that form
# is chained assignment and does not update `modomics` under pandas Copy-on-Write.
modomics['Reaction'] = modomics['Reaction'].replace({'C:m5C': 'SO_0001918',
'xX:Xm':'SO_0001353',
'A:m6A': 'SO_0001920',
'A:I': 'GO_0006382',
'C:U':'GO_0016554',
'U:Y':'SO_0001332',
'A:m1A': 'SO_0001295'})
modomics['Source'] = 'Modomics'

# Rows whose reaction maps to an SO term. Taken as an explicit copy so that later cells can
# modify `pro_so` without touching (or warning about) `modomics`.
pro_so = modomics[modomics['Reaction'].str.contains("SO")].copy()
pro_so.head(n=3)

In [None]:
# Build the protein -> SO edge table: one row per (:START_ID, :END_ID) pair with the
# sources and locations gathered into sets, then write it to disk.
pro_so = pro_so.rename(columns={'Reaction': ':END_ID', 1: ':START_ID'})
pro_so = (
    pro_so
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set})
    .reset_index()
    .assign(**{':TYPE': "has_part_that_occurs_in"})
)
pro_so.to_pickle(unprocessed_edge_data_location + 'OBO_has_part_that_occurs_in_OBO.pkl')
pro_so.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002331 (involved in) - OBO

In [None]:
# Rows of `modomics` whose reaction identifier contains "GO", reshaped into an edge table:
# one row per (:START_ID, :END_ID) pair with sources and locations gathered into sets.
has_go_reaction = modomics['Reaction'].str.contains("GO")
pro_go = modomics[has_go_reaction].rename(columns={'Reaction': ':END_ID', 1: ':START_ID'})
pro_go = (
    pro_go
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Location': set})
    .reset_index()
    .assign(**{':TYPE': "involved_in"})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_involved_in_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo?mod (modified in) - OBO

* [Modomics](https://genesilico.pl/modomics/) <br/> Modomics is a database of RNA modifications that provides comprehensive information concerning the chemical structures of modified ribonucleosides, their biosynthetic pathways, the location of modified residues in RNA sequences, and RNA modifying enzymes.

In [None]:
# https://genesilico.pl/modomics/sequences/ --> Check "Unmodified sequences" --> Click on "FASTA" button

In [None]:
file_path = unprocessed_data_location + 'unmodified_all_all_all_rna_sequences.fasta'
metadata_list = []
sequence_list = []

# Walk through the FASTA file: a line starting with '>' opens a new record (its text, minus
# the '>', is the record's metadata); every other line is a chunk of the current sequence.
with open(file_path, 'r') as fasta_handle:
    current_header = None
    current_chunks = []

    for raw_line in fasta_handle:
        stripped = raw_line.strip()
        if not stripped.startswith('>'):
            current_chunks.append(stripped)
            continue
        # A new header closes the record collected so far (if any).
        if current_header:
            metadata_list.append(current_header)
            sequence_list.append(''.join(current_chunks))
        current_header = stripped[1:]
        current_chunks = []

    # Flush the record that was still open at end of file.
    if current_header:
        metadata_list.append(current_header)
        sequence_list.append(''.join(current_chunks))

# Each header is a '|'-separated list of 'key:value' fields; fields without ':' are ignored
# and the value keeps everything after the first ':'.
metadata_dicts = [
    dict(field.split(':', 1) for field in header.split('|') if ':' in field)
    for header in metadata_list
]

# Assemble the table (SO -- GO)
df = pd.DataFrame(metadata_dicts)
df['Sequence'] = sequence_list
# Human records only; the other species considered are to be added later.
is_human = df['Species'] == 'Homo sapiens'
df = df.loc[is_human, ['SOterm', 'Cellular_Localization']]
df = df.assign(SOterm=df['SOterm'].str.replace(":", "_"))

# Replace the localisation text by column 1 of desc_go_map, joined on its column 0
# (inner merge: localisations without a match are dropped).
go_by_description = desc_go_map.rename(columns={0: "Cellular_Localization"})
df = (
    df.merge(go_by_description, on='Cellular_Localization')
    .drop(columns='Cellular_Localization')
    .rename(columns={1: 'GO'})
)

df['Source'] = 'Modomics'
df.head(n=3)

In [None]:
# Build the SO -> GO edge table: one row per (:START_ID, :END_ID) pair with the sources
# gathered into a set, then write it to disk.
df = df.rename(columns={'GO': ':END_ID', 'SOterm': ':START_ID'})
df = (
    df.groupby([':START_ID', ':END_ID'])
    .agg({'Source': set})
    .reset_index()
    .assign(**{':TYPE': "modified_in"})
)
df.to_pickle(unprocessed_edge_data_location + 'OBO_modified_in_OBO.pkl')
df.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/BFO_0000050 (part of) - OBO

* [Vesiclepedia](http://microvesicles.org/index.html) <br/> Vesiclepedia is a manually curated compendium of molecular data (lipid, RNA and protein) identified in different classes of extracellular vesicles. 

In [None]:
!wget http://microvesicles.org/Archive/Vesiclepedia_Proteins_GPAD.txt -O ../resources/processed_data/unprocessed_data/Vesiclepedia_Proteins_GPAD.txt

In [None]:
# Load the Vesiclepedia GPAD file (PRO -- GO); lines starting with '!' are skipped as comments.
go_annotations = pd.read_csv(unprocessed_data_location+"Vesiclepedia_Proteins_GPAD.txt", comment='!', delimiter='\t', names=[ # PRO -- GO
        'DB','DB Object ID','Qualifier','GO ID','DB:Reference(s)','Evidence Code','With (or) From','Interacting taxon ID','Date',
        'Assigned By','Annotation Extension','Annotation Properties'])
# Fold 'Assigned By' into 'DB' (split back into one row per source further down).
go_annotations['DB'] = go_annotations['DB'] + ", " + go_annotations['Assigned By']
go_annotations.drop(columns=['Assigned By','With (or) From','Interacting taxon ID','Date','Annotation Properties'], inplace=True)
go_annotations['GO ID'] = go_annotations['GO ID'].str.replace(':','_')
go_annotations['DB:Reference(s)'] = go_annotations['DB:Reference(s)'].str.replace('PMID:','')
print(go_annotations['Qualifier'].unique())
print(go_annotations['DB'].unique())
go_annotations['DB'] = go_annotations['DB'].str.split(", ")
go_annotations = go_annotations.explode('DB')
# Keep only the part of the evidence code before the first ':'.
go_annotations['Evidence Code'] = go_annotations['Evidence Code'].str.split(':').str[0]
print(go_annotations['Evidence Code'].unique())

# Normalise references to digit strings (anything non-numeric becomes NaN).
# The float suffix is removed with an anchored, escaped regex: the former
# `.str.replace(".0", "")` is a regex on pandas < 2.0, where '.' matches any character,
# so it also deleted digit pairs such as "30" from inside the identifier.
go_annotations['DB:Reference(s)'] = pd.to_numeric(go_annotations['DB:Reference(s)'], errors='coerce')
go_annotations['DB:Reference(s)'] = go_annotations['DB:Reference(s)'].astype(str)
go_annotations['DB:Reference(s)'] = go_annotations['DB:Reference(s)'].str.replace(r"\.0$", "", regex=True)
go_annotations['DB:Reference(s)'] = go_annotations['DB:Reference(s)'].replace("nan", np.nan)

# Keep the identifier between '(' and ')' of the annotation extension, written with '_'.
# The pattern is a raw string: "\)" in a normal string literal is an invalid escape sequence.
go_annotations['Annotation Extension'] = go_annotations['Annotation Extension'].str.split("(").str[1]
go_annotations['Annotation Extension'] = go_annotations['Annotation Extension'].str.replace(":", "_").str.replace(r"\)", "",regex=True)
# Look the identifier up in desc_anatomyCell_map (its column 1); identifiers without a match
# are kept as-is in column 0.
go_annotations = pd.merge(go_annotations, desc_anatomyCell_map.rename(columns={1:'Annotation Extension'}), on='Annotation Extension', how='left')
go_annotations[0] = go_annotations[0].fillna(go_annotations['Annotation Extension'])

# Attach column 1 of unipro_pro_map to each 'DB Object ID' (inner merge: unmatched IDs are dropped).
go_annotations = go_annotations.merge(unipro_pro_map.rename(columns={0:'DB Object ID'}), on='DB Object ID')

go_annotations = go_annotations.rename(columns={'DB':'Source',1:':START_ID','GO ID':':END_ID',0:'Location',
                                                  'Evidence Code':'GO_evidence','DB:Reference(s)':'PubMedID'})
go_annotations.head(n=3)

* The GO Consortium

In [None]:
# Preview the GO Consortium 'part of' annotations that are merged in the next cell.
pro_go_dict['pro_go_part_of'].head(3)

In [None]:
# Combine the Vesiclepedia and GO Consortium annotations and collapse them to one row per
# (:START_ID, :END_ID) pair, gathering each attribute's values into a set.
part_of_attributes = ['GO_evidence', 'Source', 'PubMedID', 'Location']
go_annotations = pd.concat([go_annotations, pro_go_dict['pro_go_part_of']])
go_annotations = (
    go_annotations
    .groupby([':START_ID', ':END_ID'])
    .agg(dict.fromkeys(part_of_attributes, set))
    .reset_index()
    .assign(**{':TYPE': 'part_of'})
)
go_annotations.to_pickle(unprocessed_edge_data_location + 'OBO_part_of_OBO.pkl')

# The inverse relation is the same table with start and end swapped.
go_annotations = go_annotations.rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
go_annotations = go_annotations.assign(**{':TYPE': 'has_part'})
go_annotations.to_pickle(unprocessed_edge_data_location + 'OBO_has_part_OBO.pkl')
go_annotations.head(n=3)

***
### Gene - http://purl.obolibrary.org/obo/RO_0002434 (interacts with) - OBO

* CTD

In [None]:
# Download the CTD chemical-gene interaction report into the unprocessed-data folder.
# NOTE(review): the following cell reads 'CTD_chem_gene_ixns.tsv', so data_downloader
# presumably decompresses the .gz archive -- confirm against its implementation.
data_downloader("https://ctdbase.org/reports/CTD_chem_gene_ixns.tsv.gz", unprocessed_data_location)

In [None]:
# Load the CTD chemical-gene interactions; '#'-prefixed lines are skipped, so the column
# names are supplied explicitly.
ctd_gene = pd.read_csv(unprocessed_data_location+'CTD_chem_gene_ixns.tsv', sep='\t', comment="#",
                          names=['ChemicalName','ChemicalID','CasRN','GeneSymbol','GeneID','GeneForms',
                                 'Organism','OrganismID','Interaction','InteractionActions','PubMedIDs'])
# Keep human rows whose gene form starts with "gene" and whose action is not exactly 'affects'.
ctd_gene = ctd_gene[ctd_gene['InteractionActions'] != 'affects']
ctd_gene = ctd_gene[ctd_gene['Organism'] == 'Homo sapiens']
ctd_gene = ctd_gene[(ctd_gene['GeneForms'].notna()) & (ctd_gene['GeneForms'].str.startswith("gene"))]
ctd_gene = ctd_gene[['ChemicalID','GeneID','PubMedIDs']]
# Prefix the chemical ID and translate it through mesh_to_chebi
# (inner merge on its column 0: chemicals without a match are dropped).
ctd_gene['ChemicalID'] = 'MESH_' + ctd_gene['ChemicalID']
ctd_gene = pd.merge(ctd_gene, mesh_to_chebi.rename(columns={0:'ChemicalID'}), on='ChemicalID').drop(columns=['ChemicalID'])

# Normalise PubMed IDs to digit strings.
# NOTE(review): `errors='coerce'` turns every value that is not a single number (for example
# several IDs in one field) into NaN -- confirm that losing those references is intended.
# The float suffix is removed with an anchored, escaped regex: the former
# `.str.replace(".0", "")` is a regex on pandas < 2.0, where '.' matches any character,
# so it also deleted digit pairs such as "30" from inside the identifier.
ctd_gene['PubMedIDs'] = pd.to_numeric(ctd_gene['PubMedIDs'], errors='coerce')
ctd_gene['PubMedIDs'] = ctd_gene['PubMedIDs'].astype(str)
ctd_gene['PubMedIDs'] = ctd_gene['PubMedIDs'].str.replace(r"\.0$", "", regex=True)
ctd_gene['PubMedIDs'] = ctd_gene['PubMedIDs'].replace("nan", np.nan)

ctd_gene = ctd_gene.rename(columns={'GeneID':':START_ID', 1:':END_ID', 'PubMedIDs':'PubMedID'})
ctd_gene['Source'] = 'CTD'
ctd_gene.head(n=3)

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
!wget http://www.rnainter.org/raidMedia/download/Download_data_RP.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RP.tar.gz

In [None]:
# Load the RNAInter RNA-protein table.
RNA_protein = pd.read_csv(unprocessed_data_location + 'Download_data_RP.tar.gz', sep='\t')  # PRO

# We select only strong evidence interactions for hsa
strong_human_rows = (
    (RNA_protein['score'] >= 0.2886)
    & RNA_protein['Species1'].str.contains('apiens')
    & RNA_protein['Species2'].str.contains('apiens')
)
RNA_protein = RNA_protein[strong_human_rows]

print(set(RNA_protein.Category2)) # proteins are all in the second column
print(set(RNA_protein.Category1))

# Strip the database prefixes from the first interactor's identifiers.
raw_rna_ids = RNA_protein['Raw_ID1']
for db_prefix in ("NCBI:", "miRBase:", "circBase:", "Ensembl:"):
    raw_rna_ids = raw_rna_ids.str.replace(db_prefix, '')
RNA_protein['Raw_ID1'] = raw_rna_ids

# One row per ';'-separated identifier on either side.
for id_column in ('Raw_ID1', 'Raw_ID2'):
    RNA_protein[id_column] = RNA_protein[id_column].str.split(';')
    RNA_protein = RNA_protein.explode(id_column)

# Translate the second interactor in two passes: first by its raw ID through entrez_pro_map,
# then by its symbol through symbol_to_pro. In each pass the lookup's column 1 becomes the
# new 'Raw_ID2'; rows without a match keep the value they had before the pass.
entrez_pro_map[0] = entrez_pro_map[0].astype(str)
for lookup_table, join_column in ((entrez_pro_map, 'Raw_ID2'), (symbol_to_pro, 'Interactor2.Symbol')):
    RNA_protein = pd.merge(RNA_protein, lookup_table.rename(columns={0: join_column}), on=join_column, how='left')
    RNA_protein[1] = RNA_protein[1].fillna(RNA_protein['Raw_ID2'])
    RNA_protein = RNA_protein.drop(columns=['Raw_ID2']).rename(columns={1: 'Raw_ID2'})
RNA_protein.head(n=3)
# Pass 1: identifiers equal to an Ensembl gene ID are replaced by the RNAcentral ID;
# everything else keeps its current identifier.
ensembl_gene_lookup = rnacentral_map_human_ensembl[['Ensembl Gene ID', 'RNAcentral ID']].drop_duplicates()
RNA_protein = RNA_protein.merge(ensembl_gene_lookup, left_on=['Raw_ID1'], right_on=['Ensembl Gene ID'],
                                how="left").drop(columns=["Ensembl Gene ID"])

RNA_protein['RNAcentral ID'] = RNA_protein['RNAcentral ID'].fillna(RNA_protein['Raw_ID1'])
RNA_protein = RNA_protein.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})

RNA_protein.head(n=3)
# Pass 2: same replacement for identifiers equal to a miRBase ID.
mirbase_lookup = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
RNA_protein = RNA_protein.merge(mirbase_lookup, left_on=['Raw_ID1'], right_on=['miRBase ID'],
                                how="left").drop(columns=["miRBase ID"])

RNA_protein['RNAcentral ID'] = RNA_protein['RNAcentral ID'].fillna(RNA_protein['Raw_ID1'])
RNA_protein = RNA_protein.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})

RNA_protein.head(n=3)
is_trna = RNA_protein['Category1'] == 'tRNA'
print(RNA_protein.loc[is_trna, 'Interactor1.Symbol'].unique()[:3]) 
# Pass 3: rows whose symbol equals a GtRNAdb gene ID get that entry's RNAcentral ID.
gtrnadb_lookup = rnacentral_map_human_gtrnadb[['RNAcentral ID', 'GtRNAdb Gene ID']].drop_duplicates().rename(
    columns={'GtRNAdb Gene ID': 'Interactor1.Symbol'})
RNA_protein = RNA_protein.merge(gtrnadb_lookup, on='Interactor1.Symbol', how='left')
RNA_protein['RNAcentral ID'] = RNA_protein['RNAcentral ID'].fillna(RNA_protein['Raw_ID1'])
RNA_protein = RNA_protein.drop(columns=['Raw_ID1']).rename(columns={'RNAcentral ID': 'Raw_ID1'})
RNA_protein.head(n=3)
# Re-map the mRNA rows of RNA_protein: 'Raw_ID1' is matched against column 0 of
# ensembl_entrezTranscript_map (entries whose column 2 is 'protein-coding') and replaced by
# that table's column 1. mRNA rows without a match are dropped (inner merge).
is_mrna = RNA_protein['Category1'] == 'mRNA'
mrna = RNA_protein[is_mrna]
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
mrna = pd.merge(mrna, ensembl_entrezTranscript_map_mrna, left_on=['Raw_ID1'],
                right_on=[0]).drop(columns=['Raw_ID1',0,2,3,4,5]).rename(columns={1:'Raw_ID1'})

# Put the re-mapped rows back with the untouched ones. `ignore_index=True` gives the result
# unique row labels: the merged `mrna` frame is numbered from 0 again, and without
# renumbering those labels collide with the remaining rows, so that a later
# `drop(index=...)` by label deletes unrelated rows as well.
RNA_protein = pd.concat([mrna, RNA_protein[~is_mrna]], ignore_index=True)

mrna.head(n=3)
# Re-map the remaining RNA categories by HGNC symbol. Rows of the categories listed here
# are left untouched by this step.
categories_left_as_is = ['miRNA', 'pseudo', 'mRNA', 'tRNA', 'rRNA']
is_other_ncrna = ~RNA_protein['Category1'].isin(categories_left_as_is)
ncrna = RNA_protein[is_other_ncrna]
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]
# Symbol and category must both match an HGNC entry; rows without a match are dropped
# (inner merge). The matching entry's RNAcentral ID becomes the new 'Raw_ID1'.
ncrna = pd.merge(ncrna, rnacentral_map_human_hgnc_type, left_on=['Interactor1.Symbol','Category1'],
                right_on=["HGNC symbol",'RNA category']).drop(columns=["Interactor1.Symbol",'Raw_ID1','RNA category'])
ncrna = ncrna.rename(columns={"RNAcentral ID":'Raw_ID1'})

# Remove the old rows with the boolean mask rather than `drop(index=labels)`: row labels in
# RNA_protein are not guaranteed to be unique at this point (earlier concatenations mix
# renumbered and original labels), and dropping by label would then also delete unrelated
# rows that happen to share a label. The result is renumbered so labels are unique again.
RNA_protein = pd.concat([ncrna, RNA_protein[~is_other_ncrna]], ignore_index=True)

ncrna.head(n=3)
# Start mislabeled data
# Load the RNAInter RNA-RNA table; its first column is exposed as 'RNAInterID'.
RNA_RNA = pd.read_csv(unprocessed_data_location + 'Download_data_RR.tar.gz', sep='\t')
RNA_RNA = RNA_RNA.rename(columns={'Download_data_RR.txt': 'RNAInterID'})

# We select only strong evidence interactions for hsa
strong_human_pairs = (
    (RNA_RNA['score'] >= 0.2886)
    & RNA_RNA['Species1'].str.contains('apiens')
    & RNA_RNA['Species2'].str.contains('apiens')
)
RNA_RNA = RNA_RNA[strong_human_pairs]

# Relabel the 'PCG' category as 'mRNA' on both sides.
for category_column in ('Category1', 'Category2'):
    RNA_RNA[category_column] = RNA_RNA[category_column].str.replace("PCG", 'mRNA')

# Rewrite database prefixes in the raw identifiers: the first three are removed,
# the tRF prefix is rewritten to 'trfdb?'.
prefix_rewrites = [("NCBI:", ''), ("miRBase:", ''), ("circBase:", ''), ("tRFdb:", 'trfdb?')]
for id_column in ('Raw_ID1', 'Raw_ID2'):
    cleaned_ids = RNA_RNA[id_column]
    for db_prefix, replacement in prefix_rewrites:
        cleaned_ids = cleaned_ids.str.replace(db_prefix, replacement)
    RNA_RNA[id_column] = cleaned_ids

# One row per ';'-separated identifier on either side.
for id_column in ('Raw_ID1', 'Raw_ID2'):
    RNA_RNA[id_column] = RNA_RNA[id_column].str.split(';')
    RNA_RNA = RNA_RNA.explode(id_column)

# Keep only the part of each symbol before the first '.'.
for symbol_column in ('Interactor1.Symbol', 'Interactor2.Symbol'):
    RNA_RNA[symbol_column] = RNA_RNA[symbol_column].str.split('.').str[0]

RNA_RNA.head(n=3)
a = set(RNA_RNA.Category1)
b = set(RNA_RNA.Category2)
a.union(b)
# Replace identifiers that equal a miRBase ID by the corresponding RNAcentral ID, on both
# sides; identifiers without a match are kept unchanged.
mirbase_lookup = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
for id_column in ('Raw_ID1', 'Raw_ID2'):
    RNA_RNA = pd.merge(RNA_RNA, mirbase_lookup, left_on=[id_column],
                       right_on=['miRBase ID'], how="left").drop(columns=["miRBase ID"])
    RNA_RNA['RNAcentral ID'] = RNA_RNA['RNAcentral ID'].fillna(RNA_RNA[id_column])
    RNA_RNA = RNA_RNA.drop(columns=[id_column]).rename(columns={'RNAcentral ID': id_column})

RNA_RNA.head(n=2)

# Hand-curated identifier corrections, addressed by row label.
# NOTE(review): the labels are positions produced by the merges above, so they are only valid
# for the exact file version they were read from -- confirm after every data refresh.
# A missing label is reported and skipped: assigning through `.loc` to a label that does not
# exist would silently append a new, otherwise empty row.
print(RNA_RNA[RNA_RNA['Category2'] == 'piRNA'])
for row_label, curated_id in ((31571, 'piR-hsa-39980'), (39194, 'piR-hsa-20280')):
    if row_label in RNA_RNA.index:
        RNA_RNA.loc[row_label, 'Raw_ID2'] = curated_id
    else:
        print(f"Manual fix skipped: row {row_label} not found in RNA_RNA")

print(RNA_RNA[RNA_RNA['Category1'] == 'tRNA'])
for row_label, curated_id in ((23191, 'URS0000287398'), (23192, 'URS00003C9A26')):
    if row_label in RNA_RNA.index:
        RNA_RNA.loc[row_label, 'Raw_ID1'] = curated_id
    else:
        print(f"Manual fix skipped: row {row_label} not found in RNA_RNA")
# Collapse the pseudogene transcript types listed below into the single label "pseudo".
pseudogene_biotypes = [
    "transcribed_unitary_pseudogene",
    "transcribed_unprocessed_pseudogene",
    "transcribed_processed_pseudogene",
    "unprocessed_pseudogene",
    "processed_pseudogene",
    "IG_V_pseudogene",
    "unitary_pseudogene",
    "TR_J_pseudogene",
    "TR_V_pseudogene",
    "IG_C_pseudogene",
    "IG_J_pseudogene",
    "translated_processed_pseudogene",
    "pseudogene",
    "IG_pseudogene",
]
# These are substring replacements, so longer names must be handled before the shorter names
# they contain. In list order, "processed_pseudogene" and "pseudogene" were applied first and
# left "translated_pseudo" and "IG_pseudo" behind, which the later, longer patterns could no
# longer match.
transcript_types = ensembl_map['ensembl_transcript_type']
for biotype in sorted(pseudogene_biotypes, key=len, reverse=True):
    transcript_types = transcript_types.str.replace(biotype, "pseudo", regex=False)
ensembl_map['ensembl_transcript_type'] = transcript_types
# Re-map pseudogene interactors on both sides of RNA_RNA: the raw ID is matched against
# column 0 of ensembl_entrezTranscript_map (entries whose column 2 is 'pseudo') and replaced
# by that table's column 1. Pseudogene rows without a match are dropped (inner merge).
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_pseudo = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'pseudo']

for id_column, category_column in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    is_pseudo = RNA_RNA[category_column] == 'pseudo'
    pseudo = pd.merge(RNA_RNA[is_pseudo], ensembl_entrezTranscript_map_pseudo, left_on=[id_column],
                      right_on=[0]).drop(columns=[id_column, 0, 2, 3, 4, 5]).rename(columns={1: id_column})
    # Old rows are removed with the boolean mask and the result is renumbered. Removing them
    # with `drop(index=labels)` is wrong once labels repeat: the merged frame is numbered
    # from 0 again, so after the first concatenation a label-based drop also deletes
    # unrelated rows that share a label.
    RNA_RNA = pd.concat([pseudo, RNA_RNA[~is_pseudo]], ignore_index=True)

pseudo.head(n=3)
# Re-map mRNA interactors on both sides of RNA_RNA: the raw ID is matched against column 0 of
# ensembl_entrezTranscript_map (entries whose column 2 is 'protein-coding') and replaced by
# that table's column 1. mRNA rows without a match are dropped (inner merge).
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']

for id_column, category_column in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    is_mrna = RNA_RNA[category_column] == 'mRNA'
    mrna = pd.merge(RNA_RNA[is_mrna], ensembl_entrezTranscript_map_mrna, left_on=[id_column],
                    right_on=[0]).drop(columns=[id_column, 0, 2, 3, 4, 5]).rename(columns={1: id_column})
    # Old rows are removed with the boolean mask and the result is renumbered. Removing them
    # with `drop(index=labels)` is wrong once labels repeat: the merged frame is numbered
    # from 0 again, so after a concatenation a label-based drop also deletes unrelated rows
    # that share a label.
    RNA_RNA = pd.concat([mrna, RNA_RNA[~is_mrna]], ignore_index=True)

mrna.head(n=3)
# Re-map protein / TF interactors on both sides of RNA_RNA: the raw ID is matched against
# column 0 of entrez_pro_map and replaced by that table's column 1. Rows without a match are
# dropped (inner merge).
entrez_pro_map[0] = entrez_pro_map[0].astype(str)

for id_column, category_column in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    is_protein = RNA_RNA[category_column].isin(['protein', 'TF'])
    protein = pd.merge(RNA_RNA[is_protein], entrez_pro_map, left_on=[id_column],
                       right_on=[0]).drop(columns=[id_column, 0]).rename(columns={1: id_column})
    # Old rows are removed with the boolean mask and the result is renumbered. Removing them
    # with `drop(index=labels)` is wrong once labels repeat: the merged frame is numbered
    # from 0 again, so after a concatenation a label-based drop also deletes unrelated rows
    # that share a label.
    RNA_RNA = pd.concat([protein, RNA_RNA[~is_protein]], ignore_index=True)

protein.head(n=3)
# Re-map the remaining RNA categories on both sides of RNA_RNA by HGNC symbol. Rows of the
# categories listed here are left untouched by this step.
categories_left_as_is = ['protein', 'TF', 'mRNA', 'pseudo', 'piRNA', 'miRNA', 'eRNA', 'circRNA', 'tRF']
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] ==9606]

for symbol_column, id_column, category_column in (('Interactor1.Symbol', 'Raw_ID1', 'Category1'),
                                                  ('Interactor2.Symbol', 'Raw_ID2', 'Category2')):
    is_other_ncrna = ~RNA_RNA[category_column].isin(categories_left_as_is)
    # Symbol and category must both match an HGNC entry; rows without a match are dropped
    # (inner merge). The matching entry's RNAcentral ID becomes the new raw ID.
    ncrna = pd.merge(RNA_RNA[is_other_ncrna], rnacentral_map_human_hgnc_type,
                     left_on=[symbol_column, category_column],
                     right_on=["HGNC symbol", 'RNA category']).drop(columns=[symbol_column, id_column, 'RNA category'])
    ncrna = ncrna.rename(columns={"RNAcentral ID": id_column})
    # Old rows are removed with the boolean mask and the result is renumbered. Removing them
    # with `drop(index=labels)` is wrong once labels repeat: the merged frame is numbered
    # from 0 again, so after a concatenation a label-based drop also deletes unrelated rows
    # that share a label.
    RNA_RNA = pd.concat([ncrna, RNA_RNA[~is_other_ncrna]], ignore_index=True)

# Keep only the columns used downstream.
RNA_RNA = RNA_RNA[['Raw_ID1','Raw_ID2','Category1','Category2','score','strong','weak','predict']]

ncrna.head(n=3)
a = set(RNA_RNA.Category1)
b = set(RNA_RNA.Category2)
a.union(b)
# Mislabeled data
# RNA-RNA rows in which either identifier starts with "PR" are moved to the RNA-protein table.
# NOTE(review): when the "PR" identifier sits in 'Raw_ID1', that row enters the table with the
# protein as start node, unlike the native RNA-protein rows -- confirm this is intended.
RNA_protein_ = RNA_RNA[(RNA_RNA['Raw_ID1'].str.startswith("PR")) | (RNA_RNA['Raw_ID2'].str.startswith("PR"))]
# End mislabeled data
# Keep native RNA-protein rows whose first identifier starts with "URS", "ENST", "hsa_circ"
# or a digit.
RNA_protein = RNA_protein[(RNA_protein['Raw_ID1'].str.startswith("URS")) | (RNA_protein['Raw_ID1'].str.startswith("ENST")) |
                          (RNA_protein['Raw_ID1'].str.startswith("hsa_circ")) | RNA_protein['Raw_ID1'].str[0].str.isdigit()]

RNA_protein = pd.concat([RNA_protein_, RNA_protein])

# Build one lower-cased method string per row from the three evidence columns, remove the
# "nan" placeholders of empty columns, then split into one row per method.
RNA_protein['Method'] = RNA_protein['strong'].astype(str) + '//' + RNA_protein['weak'].astype(str) + '//' + RNA_protein['predict'].astype(str)
RNA_protein['Method'] = RNA_protein['Method'].str.lower()
# Plain-text replacements: the former patterns 'nan\/\/' and '\/\/nan' relied on '\/', which
# is an invalid escape sequence in a normal string literal; they match exactly these texts.
RNA_protein['Method'] = RNA_protein['Method'].str.replace('nan//', '', regex=False)
RNA_protein['Method'] = RNA_protein['Method'].str.replace('//nan', '', regex=False)
RNA_protein['Method'] = RNA_protein['Method'].replace('nan',np.nan)
RNA_protein['Method'] = RNA_protein['Method'].str.split('//')
RNA_protein = RNA_protein.explode('Method')
# Map method labels through method_map; labels without a match are kept as-is.
RNA_protein = pd.merge(RNA_protein, method_map, right_on='0_y', left_on='Method', how='left')
RNA_protein['0_x'] = RNA_protein['0_x'].fillna(RNA_protein['Method'])
RNA_protein = RNA_protein.drop(columns=['0_y', 'Method'])
RNA_protein = RNA_protein.rename(columns={'0_x':'Method'})

RNA_protein['Source'] = 'RNAInter'
RNA_protein = RNA_protein.rename(columns={'Raw_ID1':':START_ID', 'Raw_ID2':':END_ID', 'score':'RNAsister_score'})
# Rows whose start identifier begins with a digit go to the gene table; the rest stay RNA.
gene_protein = RNA_protein[RNA_protein[':START_ID'].str[0].str.isdigit()]
RNA_protein = RNA_protein[~RNA_protein[':START_ID'].str[0].str.isdigit()]
gene_protein.head(n=3)

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter "Download_data_RC" archive (read below as the RNA-chemical table).
!wget http://www.rnainter.org/raidMedia/download/Download_data_RC.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RC.tar.gz

In [None]:
# Load the RNAInter RNA-chemical table; the first column header comes through
# as 'Download_data_RC.txt' and is renamed to 'RNAInterID'.
RNA_chemical = pd.read_csv(unprocessed_data_location+'Download_data_RC.tar.gz',sep='\t').rename(columns={'Download_data_RC.txt':'RNAInterID'})

# We select only strong evidence interactions for hsa
# (.copy() so the column assignments below write to an independent frame)
RNA_chemical = RNA_chemical[(RNA_chemical['score'] >= 0.2886) &
                  (RNA_chemical['Species1'].str.contains('apiens'))].copy()

print(set(RNA_chemical.Category2)) # Chemicals are all in the second column
print(set(RNA_chemical.Category1))

# Both identifier fields are ';'-separated lists: one row per identifier.
RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.split(';')
RNA_chemical = RNA_chemical.explode('Raw_ID1')
# Keep the part of the RNA symbol before the first '.'. The result is a plain
# string, so the explode() that used to follow this line was a no-op.
RNA_chemical['Interactor1.Symbol'] = RNA_chemical['Interactor1.Symbol'].str.split('.').str[0]
RNA_chemical['Raw_ID2'] = RNA_chemical['Raw_ID2'].str.split(';')
RNA_chemical = RNA_chemical.explode('Raw_ID2')
RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.strip()
RNA_chemical['Raw_ID2'] = RNA_chemical['Raw_ID2'].str.strip()

RNA_chemical = RNA_chemical[RNA_chemical['Interactor2.Symbol'].notna()].copy()
RNA_chemical['Interactor2.Symbol'] = RNA_chemical['Interactor2.Symbol'].str.lower()

# Translate the chemical name through desc_chebi_map, then desc_drugbank_map
# (column 0 -> column 1 in both). Names without a mapping are kept unchanged
# so the second lookup can still resolve them.
for chemical_map in (desc_chebi_map, desc_drugbank_map):
    RNA_chemical = pd.merge(RNA_chemical, chemical_map.rename(columns={0: 'Interactor2.Symbol'}),
                            on='Interactor2.Symbol', how='left')
    # Assign the result back: a chained `df[1].fillna(..., inplace=True)` is
    # not guaranteed to modify the frame.
    RNA_chemical[1] = RNA_chemical[1].fillna(RNA_chemical['Interactor2.Symbol'])
    RNA_chemical = RNA_chemical.drop(columns=['Interactor2.Symbol']).rename(columns={1:'Interactor2.Symbol'})

# Strip database prefixes from the RNA identifiers (literal match)
for id_prefix in ("NCBI:", "miRBase:", "Ensembl:", "circBase:"):
    RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.replace(id_prefix, '', regex=False)

RNA_chemical.head(n=3)
# Swap miRBase identifiers in Raw_ID1 for RNAcentral IDs where a mapping
# exists; identifiers without a mapping are kept unchanged.
mirbase_lookup = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
RNA_chemical = (
    RNA_chemical
    .merge(mirbase_lookup, how="left", left_on='Raw_ID1', right_on='miRBase ID')
    .drop(columns=["miRBase ID"])
)
RNA_chemical = (
    RNA_chemical
    .assign(**{'RNAcentral ID': RNA_chemical['RNAcentral ID'].fillna(RNA_chemical['Raw_ID1'])})
    .drop(columns=['Raw_ID1'])
    .rename(columns={'RNAcentral ID': 'Raw_ID1'})
)

RNA_chemical.head(n=3)
# Re-key mRNA interactors through ensembl_entrezTranscript_map: Raw_ID1 is
# matched against map column 0 and replaced by map column 1, restricted to
# entries whose column 2 is 'protein-coding'.
# (presumably column 0 = Entrez gene ID, column 1 = transcript ID - TODO confirm)
is_mrna = RNA_chemical['Category1'] == 'mRNA'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
# Inner join: mRNA rows without a mapping are dropped.
mrna = (
    pd.merge(RNA_chemical[is_mrna], ensembl_entrezTranscript_map_mrna, left_on='Raw_ID1', right_on=0)
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# Select the untouched rows with the boolean mask and rebuild a unique index.
# Without ignore_index the merged rows (labelled 0..k) share labels with the
# remaining rows, and a later label-based drop() would remove both.
RNA_chemical = pd.concat([mrna, RNA_chemical[~is_mrna]], ignore_index=True)

mrna.head(n=3)
# Re-key pseudogene interactors through ensembl_entrezTranscript_map: Raw_ID1
# is matched against map column 0 and replaced by map column 1, restricted to
# entries whose column 2 is 'pseudogene'.
is_pseudo = RNA_chemical['Category1'] == 'pseudo'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_pseudo = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'pseudogene']
# Inner join: pseudogene rows without a mapping are dropped.
pseudo = (
    pd.merge(RNA_chemical[is_pseudo], ensembl_entrezTranscript_map_pseudo, left_on='Raw_ID1', right_on=0)
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# Remove the original pseudogene rows with a boolean mask rather than by index
# label: RNA_chemical may carry duplicate labels from an earlier concat, and
# drop(index=...) would then also delete unrelated rows sharing a label.
RNA_chemical = pd.concat([pseudo, RNA_chemical[~is_pseudo]], ignore_index=True)

pseudo.head(n=3)
# Re-key every interactor that is neither mRNA nor pseudogene to an RNAcentral
# ID by matching (symbol, category) against the human HGNC mapping.
# NOTE(review): this selection also covers categories such as miRNA or circRNA
# whose Raw_ID1 may already have been resolved; the inner join drops them
# unless their symbol is an HGNC symbol - confirm this is intended.
is_ncrna = ~RNA_chemical['Category1'].isin(['mRNA', 'pseudo'])
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] == 9606]
ncrna = (
    pd.merge(RNA_chemical[is_ncrna], rnacentral_map_human_hgnc_type,
             left_on=['Interactor1.Symbol', 'Category1'],
             right_on=["HGNC symbol", 'RNA category'])
    .drop(columns=["Interactor1.Symbol", 'Raw_ID1', 'RNA category'])
    .rename(columns={"RNAcentral ID": 'Raw_ID1'})
)
# Keep the mRNA/pseudogene rows via the boolean mask, not by index label:
# labels can be duplicated after earlier concats, and drop(index=...) would
# then delete already-remapped rows that share a label with an ncRNA row.
RNA_chemical = pd.concat([ncrna, RNA_chemical[~is_ncrna]], ignore_index=True)

ncrna.head(n=3)
# Keep rows whose RNA identifier has an accepted form ("URS", "ENST",
# "hsa_circ" or a leading digit) and whose chemical was resolved to a
# "DB..." or "CHEBI..." identifier.
RNA_chemical = RNA_chemical[RNA_chemical['Raw_ID1'].str.startswith("URS") |
                            RNA_chemical['Raw_ID1'].str.startswith("ENST") |
                            RNA_chemical['Raw_ID1'].str.startswith("hsa_circ") |
                            RNA_chemical['Raw_ID1'].str[0].str.isdigit()]
# .copy() so the column assignments below write to an independent frame
RNA_chemical = RNA_chemical[RNA_chemical['Interactor2.Symbol'].str.startswith("DB") |
                            RNA_chemical['Interactor2.Symbol'].str.startswith("CHEBI")].copy()

# Build one 'Method' entry per detection method: join the three evidence
# columns with '//', strip the 'nan' placeholders left by missing values,
# then split and explode. Literal patterns avoid backslash escapes in the
# string literals.
method = (RNA_chemical['strong'].astype(str) + '//' +
          RNA_chemical['weak'].astype(str) + '//' +
          RNA_chemical['predict'].astype(str)).str.lower()
method = method.str.replace('nan//', '', regex=False)
method = method.str.replace('//nan', '', regex=False)
# A row with no evidence at all collapses to the bare string 'nan'
RNA_chemical['Method'] = method.replace('nan', np.nan).str.split('//')
RNA_chemical = RNA_chemical.explode('Method')

# Translate method labels through method_map ('0_y' -> '0_x'); labels without
# a mapping keep their original text.
RNA_chemical = pd.merge(RNA_chemical, method_map, right_on='0_y', left_on='Method', how='left')
RNA_chemical['0_x'] = RNA_chemical['0_x'].fillna(RNA_chemical['Method'])
RNA_chemical = RNA_chemical.drop(columns=['0_y', 'Method']).rename(columns={'0_x':'Method'})

RNA_chemical['Source'] = 'RNAInter'
RNA_chemical = RNA_chemical.rename(columns={'Raw_ID1':':START_ID', 'Interactor2.Symbol':':END_ID', 'score':'RNAsister_score'})

# Rows whose start identifier begins with a digit go to gene_chemical
# (presumably Entrez gene IDs - TODO confirm); the rest stay in RNA_chemical.
starts_with_digit = RNA_chemical[':START_ID'].str[0].str.isdigit()
gene_chemical = RNA_chemical[starts_with_digit]
RNA_chemical = RNA_chemical[~starts_with_digit]
gene_chemical.head(n=3)

* [RNAInter](http://www.rnainter.org/) <br/> RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

In [None]:
# Download the RNAInter "Download_data_RH" archive (read below as the RNA-histone modification table).
!wget http://www.rnainter.org/raidMedia/download/Download_data_RH.tar.gz -O ../resources/processed_data/unprocessed_data/Download_data_RH.tar.gz

In [None]:
# Load the RNAInter RNA-histone modification table; the first column header
# comes through as 'Download_data_RH.txt' and is renamed to 'RNAInterID'.
RNA_hisMod = pd.read_csv(unprocessed_data_location+'Download_data_RH.tar.gz',sep='\t').rename(columns={'Download_data_RH.txt':'RNAInterID'})
# Histone modification (SO)

# We select only strong evidence interactions for hsa
# (.copy() so the column assignments below write to an independent frame)
RNA_hisMod = RNA_hisMod[(RNA_hisMod['score'] >= 0.2886) &
                  (RNA_hisMod['Species1'].str.contains('apiens'))].copy()

print(set(RNA_hisMod.Category2)) # Histone modifications are all in the second column
print(set(RNA_hisMod.Category1))

RNA_hisMod['Interactor2.Symbol'] = RNA_hisMod['Interactor2.Symbol'].str.lower()
print(RNA_hisMod['Interactor2.Symbol'].unique())

# Manual harmonisation of histone-mark spellings before the desc_so_map
# lookup. Patterns are matched literally (regex=False): with a regex default,
# the '.' in 'h3.3' would match any character (e.g. the 'k' in 'h3k36me3').
# The pair ('h2afz', 'h2azac') was commented out in the original and stays disabled.
# NOTE(review): 'h3k27me1' -> 'h2k27me1' presumably mirrors the spelling used
# in desc_so_map - confirm.
symbol_fixes = [
    ('h3k9-14ac', 'h3k9ac, h3k14ac'),
    ('hist2h3c', 'histone acetylation site'),
    ('h3ace', 'h3ac'),
    ('h3k27me1', 'h2k27me1'),
    ('h3.3', 'histone modification'),
]
hismod_symbol = RNA_hisMod['Interactor2.Symbol']
for raw_symbol, fixed_symbol in symbol_fixes:
    hismod_symbol = hismod_symbol.str.replace(raw_symbol, fixed_symbol, regex=False)
# Expand a bare 'h3k4' only. A plain substring replacement would also rewrite
# 'h3k4me1/2/3' into 'h3k4 methylation siteme1/2/3', which can no longer be
# matched against desc_so_map.
hismod_symbol = hismod_symbol.str.replace(r'h3k4(?![\w-])', 'h3k4 methylation site', regex=True)

# One row per modification (', '-separated lists, including the expansion above)
RNA_hisMod['Interactor2.Symbol'] = hismod_symbol.str.split(', ')
RNA_hisMod = RNA_hisMod.explode('Interactor2.Symbol')

# Inner join on desc_so_map (column 0 -> column 1): unmatched symbols are dropped
RNA_hisMod = pd.merge(RNA_hisMod, desc_so_map, left_on=['Interactor2.Symbol'], right_on=0)
RNA_hisMod = RNA_hisMod.drop(columns=['Interactor2.Symbol', 0])
RNA_hisMod = RNA_hisMod.rename(columns={1:'Interactor2.Symbol'})

print(set(RNA_hisMod.Category1))

# Strip database prefixes from the RNA identifiers (literal match)
for id_prefix in ("NCBI:", "miRBase:"):
    RNA_hisMod['Raw_ID1'] = RNA_hisMod['Raw_ID1'].str.replace(id_prefix, '', regex=False)

# One row per RNA identifier (';'-separated list)
RNA_hisMod['Raw_ID1'] = RNA_hisMod['Raw_ID1'].str.split(';')
RNA_hisMod = RNA_hisMod.explode('Raw_ID1')
# Keep the part of the RNA symbol before the first '.'
RNA_hisMod['Interactor1.Symbol'] = RNA_hisMod['Interactor1.Symbol'].str.split('.').str[0]

RNA_hisMod.head(n=3)
# Swap miRBase identifiers in Raw_ID1 for RNAcentral IDs where a mapping
# exists; identifiers without a mapping are kept unchanged.
mirbase_lookup = rnacentral_map_human_mirbase[['miRBase ID', 'RNAcentral ID']].drop_duplicates()
RNA_hisMod = (
    RNA_hisMod
    .merge(mirbase_lookup, how="left", left_on='Raw_ID1', right_on='miRBase ID')
    .drop(columns=["miRBase ID"])
)
RNA_hisMod = (
    RNA_hisMod
    .assign(**{'RNAcentral ID': RNA_hisMod['RNAcentral ID'].fillna(RNA_hisMod['Raw_ID1'])})
    .drop(columns=['Raw_ID1'])
    .rename(columns={'RNAcentral ID': 'Raw_ID1'})
)

RNA_hisMod.head(n=3)
# Re-key mRNA interactors through ensembl_entrezTranscript_map: Raw_ID1 is
# matched against map column 0 and replaced by map column 1, restricted to
# entries whose column 2 is 'protein-coding'.
# (presumably column 0 = Entrez gene ID, column 1 = transcript ID - TODO confirm)
is_mrna = RNA_hisMod['Category1'] == 'mRNA'
ensembl_entrezTranscript_map[0] = ensembl_entrezTranscript_map[0].astype(str)
ensembl_entrezTranscript_map_mrna = ensembl_entrezTranscript_map[ensembl_entrezTranscript_map[2] == 'protein-coding']
# Inner join: mRNA rows without a mapping are dropped.
mrna = (
    pd.merge(RNA_hisMod[is_mrna], ensembl_entrezTranscript_map_mrna, left_on='Raw_ID1', right_on=0)
    .drop(columns=['Raw_ID1', 0, 2, 3, 4, 5])
    .rename(columns={1: 'Raw_ID1'})
)
# Select the untouched rows with the boolean mask and rebuild a unique index.
# Without ignore_index the merged rows (labelled 0..k) share labels with the
# remaining rows, and a later label-based drop() would remove both.
RNA_hisMod = pd.concat([mrna, RNA_hisMod[~is_mrna]], ignore_index=True)

mrna.head(n=3)
# Re-key every interactor that is neither mRNA nor pseudogene to an RNAcentral
# ID by matching (symbol, category) against the human HGNC mapping.
# NOTE(review): this selection also covers categories such as miRNA whose
# Raw_ID1 may already have been resolved; the inner join drops them unless
# their symbol is an HGNC symbol - confirm this is intended.
is_ncrna = ~RNA_hisMod['Category1'].isin(['mRNA', 'pseudo'])
rnacentral_map_human_hgnc_type = rnacentral_map_hgnc[rnacentral_map_hgnc['Organism'] == 9606]
ncrna = (
    pd.merge(RNA_hisMod[is_ncrna], rnacentral_map_human_hgnc_type,
             left_on=['Interactor1.Symbol', 'Category1'],
             right_on=["HGNC symbol", 'RNA category'])
    .drop(columns=["Interactor1.Symbol", 'Raw_ID1', 'RNA category'])
    .rename(columns={"RNAcentral ID": 'Raw_ID1'})
)
# Keep the remaining rows via the boolean mask, not by index label: labels can
# be duplicated after an earlier concat, and drop(index=...) would then delete
# already-remapped rows that share a label with an ncRNA row.
RNA_hisMod = pd.concat([ncrna, RNA_hisMod[~is_ncrna]], ignore_index=True)

ncrna.head(n=3)
# Keep rows whose RNA identifier has an accepted form ("URS", "ENST",
# "hsa_circ" or a leading digit). .copy() so the column assignments below
# write to an independent frame.
RNA_hisMod = RNA_hisMod[RNA_hisMod['Raw_ID1'].str.startswith("URS") |
                        RNA_hisMod['Raw_ID1'].str.startswith("ENST") |
                        RNA_hisMod['Raw_ID1'].str.startswith("hsa_circ") |
                        RNA_hisMod['Raw_ID1'].str[0].str.isdigit()].copy()

# Build one 'Method' entry per detection method: join the three evidence
# columns with '//', strip the 'nan' placeholders left by missing values,
# then split and explode. Literal patterns avoid backslash escapes in the
# string literals.
method = (RNA_hisMod['strong'].astype(str) + '//' +
          RNA_hisMod['weak'].astype(str) + '//' +
          RNA_hisMod['predict'].astype(str)).str.lower()
method = method.str.replace('nan//', '', regex=False)
method = method.str.replace('//nan', '', regex=False)
# A row with no evidence at all collapses to the bare string 'nan'
RNA_hisMod['Method'] = method.replace('nan', np.nan).str.split('//')
RNA_hisMod = RNA_hisMod.explode('Method')

# Translate method labels through method_map ('0_y' -> '0_x'); labels without
# a mapping keep their original text.
RNA_hisMod = pd.merge(RNA_hisMod, method_map, right_on='0_y', left_on='Method', how='left')
RNA_hisMod['0_x'] = RNA_hisMod['0_x'].fillna(RNA_hisMod['Method'])
RNA_hisMod = RNA_hisMod.drop(columns=['0_y', 'Method']).rename(columns={'0_x':'Method'})

RNA_hisMod['Source'] = 'RNAInter'
# Rows whose identifier begins with a digit form the gene-level table
# (presumably Entrez gene IDs - TODO confirm). rename() returns a new frame,
# so the slice of RNA_hisMod is never modified in place.
gene_hisMod = RNA_hisMod[RNA_hisMod['Raw_ID1'].str[0].str.isdigit()].rename(
    columns={'Raw_ID1':':START_ID', 'Interactor2.Symbol':':END_ID', 'score':'RNAsister_score'})
gene_hisMod.head(n=3)

In [None]:
# Pool the gene-level interaction tables and collapse duplicate
# (:START_ID, :END_ID) pairs: the score is averaged, provenance fields are
# gathered into sets.
gene_interacts_with_OBO = pd.concat([ctd_gene, gene_protein, gene_chemical, gene_hisMod])
gene_interacts_with_OBO = (
    gene_interacts_with_OBO
    .groupby([':START_ID', ':END_ID'])
    # 'mean' is the aggregation pandas already applies for np.mean; the string
    # form avoids the FutureWarning raised for NumPy callables in agg().
    .agg({'RNAsister_score': 'mean', 'Source': set, 'PubMedID': set, 'Method': set})
    .reset_index()
)
gene_interacts_with_OBO[':TYPE'] = 'interacts_with'
gene_interacts_with_OBO.to_pickle(unprocessed_edge_data_location+'gene_interacts_with_OBO.pkl')

# Same edges with the endpoints swapped; the :TYPE label is left unchanged
OBO_interacts_with_gene = gene_interacts_with_OBO.rename(columns={':START_ID':':END_ID',':END_ID':':START_ID'})
OBO_interacts_with_gene.to_pickle(unprocessed_edge_data_location+'OBO_interacts_with_gene.pkl')
OBO_interacts_with_gene.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002327 (enables) - OBO

* The GO consortium

In [None]:
# Read the headerless human GO annotation file; lines starting with '!' are comments.
pro_go = pd.read_csv(unprocessed_data_location + "goa_human.gaf", sep="\t", comment="!", header=None)

# Human-only rows, restricted to the columns used below:
# 1 = key joined to unipro_pro_map, 3 = relation, 4 = GO ID, 6 = evidence, 14 = source.
pro_go = pro_go[pro_go[12] == 'taxon:9606']
pro_go[14] = 'GO_Central, ' + pro_go[14]
pro_go = pro_go[[1, 3, 4, 6, 14]]
pro_go[4] = pro_go[4].str.replace('GO:', 'GO_')
pro_go = pro_go.merge(unipro_pro_map, left_on=1, right_on=0).drop(columns=[1, '1_x', 0])

# Source clean-up, applied strictly in this order. Later entries rely on
# earlier ones: ", SynGO" turns ", SynGO-UCL" into "-UCL", which the final
# "GO_Central-UCL" rule then removes.
source_rewrites = [
    (", UniProt", ", UniProtKB"),
    (", IntAct", ""),
    (", HPA", ", The_Human_Protein_Atlas"),
    (", GOC", ""),
    (", FlyBase", ""),
    (", NTNU_SB", ""),
    (", ComplexPortal", ""),
    (", ParkinsonsUK-UCL", ""),
    (", ARUK-UCL", ""),
    (", LIFEdb", ""),
    (", BHF-UCL", ""),
    (", MGI", ""),
    (", RHEA", ""),
    (", HGNC-UCL", ", HGNC"),
    (", SYSCILIA_CCNET", ""),
    (", CACAO", ""),
    (", AgBase", ""),
    (", PINC", ""),
    (", CAFA", ""),
    (", DisProt", ""),
    (", MTBBASE", ""),
    (", YuBioLab", ""),
    (", SynGO", ""),
    (", Alzheimers_University_of_Toronto", ""),
    (", GDB", ""),
    (", SynGO-UCL", ""),
    (", DFLAT", ""),
    (", DIBU", ""),
    (", PHI-base", ""),
    (", WB", ""),
    (", Xenbase", ""),
    (", ZFIN", ""),
    (", dictyBase", ""),
    (", InterPro", ""),
    ("GO_Central-UCL", "GO_Central"),
]
for old_label, new_label in source_rewrites:
    pro_go[14] = pro_go[14].str.replace(old_label, new_label)

# One row per source, with GO_Central shortened to GOC
pro_go[14] = pro_go[14].str.split(", ")
pro_go = pro_go.explode(14)
pro_go[14] = pro_go[14].str.replace("GO_Central", "GOC")
pro_go = pro_go.rename(columns={'1_y': ':START_ID', 4: ':END_ID', 6: 'GO_evidence', 14: 'Source'})

# One frame per relation (column 3), keyed 'pro_go_<relation>'
relations = pro_go[3].unique()
pro_go_dict = {
    f'pro_go_{relation}': pro_go[pro_go[3] == relation].drop(columns=[3])
    for relation in relations
}
print(relations)
pro_go_dict['pro_go_enables'].head(n=3)

In [None]:
# 'enables' edges: one row per (:START_ID, :END_ID) pair, with sources and
# evidence codes gathered into sets.
enables_rows = pro_go_dict['pro_go_enables']
pro_go = (
    enables_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'enables'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_enables_OBO.pkl')

# Inverse edges: swap the endpoints and relabel
pro_go = (
    pro_go
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'enabled_by'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_enabled_by_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002432 (is active in) - OBO

* The GO consortium

In [None]:
# 'is_active_in' edges: one row per (:START_ID, :END_ID) pair, with sources
# and evidence codes gathered into sets.
active_in_rows = pro_go_dict['pro_go_is_active_in']
pro_go = (
    active_in_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'is_active_in'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_is_active_in_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002325 (colocalizes with) - OBO

* The GO consortium

In [None]:
# 'colocalizes_with' is stored in both directions: stack the edges on top of
# a copy with the endpoints swapped before aggregating.
forward_edges = pro_go_dict['pro_go_colocalizes_with']
reversed_edges = forward_edges.rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
pro_go = pd.concat([forward_edges, reversed_edges])

pro_go = (
    pro_go
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'colocalizes_with'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_colocalizes_with_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002264 (acts upstream of or within) - OBO

* The GO consortium

In [None]:
# 'acts_upstream_of_or_within' edges: one row per (:START_ID, :END_ID) pair,
# with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_acts_upstream_of_or_within']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of_or_within'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_acts_upstream_of_or_within_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002326 (contributes to) - OBO

* The GO consortium

In [None]:
# 'contributes_to' edges: one row per (:START_ID, :END_ID) pair, with sources
# and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_contributes_to']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'contributes_to'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_contributes_to_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0004035 (acts upstream of negative effect) - OBO

* The GO consortium

In [None]:
# 'acts_upstream_of_negative_effect' edges: one row per (:START_ID, :END_ID)
# pair, with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_acts_upstream_of_negative_effect']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of_negative_effect'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_acts_upstream_of_negative_effect_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0004032 (acts upstream of or within positive effect) - OBO

* The GO consortium

In [None]:
# 'acts_upstream_of_or_within_positive_effect' edges: one row per
# (:START_ID, :END_ID) pair, with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_acts_upstream_of_or_within_positive_effect']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of_or_within_positive_effect'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_acts_upstream_of_or_within_positive_effect_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0004033 (acts upstream of or within negative effect) - OBO

* The GO consortium

In [None]:
# 'acts_upstream_of_or_within_negative_effect' edges: one row per
# (:START_ID, :END_ID) pair, with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_acts_upstream_of_or_within_negative_effect']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of_or_within_negative_effect'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_acts_upstream_of_or_within_negative_effect_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0004034 (acts upstream of positive effect) - OBO

* The GO consortium

In [None]:
# 'acts_upstream_of_positive_effect' edges: one row per (:START_ID, :END_ID)
# pair, with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_acts_upstream_of_positive_effect']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'acts_upstream_of_positive_effect'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_acts_upstream_of_positive_effect_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/BFO_0000050?NOT (not part of) - OBO

* The GO consortium

In [None]:
# Negated 'part_of' annotations: one row per (:START_ID, :END_ID) pair, with
# sources and evidence codes gathered into sets.
not_part_of_rows = pro_go_dict['pro_go_NOT|part_of']
pro_go = (
    not_part_of_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_part_of'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_part_of_OBO.pkl')

# Inverse edges: swap the endpoints and relabel
pro_go = (
    pro_go
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_has_part'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_has_part_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002327?NOT (not enables) - OBO

* The GO consortium

In [None]:
# Negated 'enables' annotations: one row per (:START_ID, :END_ID) pair, with
# sources and evidence codes gathered into sets.
not_enables_rows = pro_go_dict['pro_go_NOT|enables']
pro_go = (
    not_enables_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_enables'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_enables_OBO.pkl')

# Inverse edges: swap the endpoints and relabel
pro_go = (
    pro_go
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_enabled_by'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_enabled_by_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0001025?NOT (not located in) - OBO

* The GO consortium

In [None]:
# Negated 'located_in' annotations: one row per (:START_ID, :END_ID) pair,
# with sources and evidence codes gathered into sets.
not_located_in_rows = pro_go_dict['pro_go_NOT|located_in']
pro_go = (
    not_located_in_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_located_in'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_located_in_OBO.pkl')

# Inverse edges: swap the endpoints and relabel
pro_go = (
    pro_go
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'not_location_of'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_location_of_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002331?NOT (not involved in) - OBO

* The GO consortium

In [None]:
# Negated 'involved_in' annotations: one row per (:START_ID, :END_ID) pair,
# with sources and evidence codes gathered into sets.
not_involved_in_rows = pro_go_dict['pro_go_NOT|involved_in']
pro_go = (
    not_involved_in_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_involved_in'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_involved_in_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002432?NOT (not is active in) - OBO

* The GO consortium

In [None]:
# Negated 'is_active_in' annotations: one row per (:START_ID, :END_ID) pair,
# with sources and evidence codes gathered into sets.
not_active_in_rows = pro_go_dict['pro_go_NOT|is_active_in']
pro_go = (
    not_active_in_rows
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_is_active_in'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_is_active_in_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002325?NOT (not colocalizes with) - OBO

* The GO consortium

In [None]:
# Negated 'colocalizes_with' is stored in both directions: stack the edges on
# top of a copy with the endpoints swapped before aggregating.
forward_edges = pro_go_dict['pro_go_NOT|colocalizes_with']
reversed_edges = forward_edges.rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
pro_go = pd.concat([forward_edges, reversed_edges])

pro_go = (
    pro_go
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_colocalizes_with'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_colocalizes_with_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0002264?NOT (not acts upstream of or within) - OBO

* The GO consortium

In [None]:
# Negated 'acts_upstream_of_or_within' annotations: one row per
# (:START_ID, :END_ID) pair, with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_NOT|acts_upstream_of_or_within']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_acts_upstream_of_or_within'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_acts_upstream_of_or_within_OBO.pkl')
pro_go.head(n=3)

***
### OBO - http://purl.obolibrary.org/obo/RO_0004033?NOT (not acts upstream of or within negative effect) - OBO

* The GO consortium

In [None]:
# Negated 'acts_upstream_of_or_within_negative_effect' annotations: one row
# per (:START_ID, :END_ID) pair, with sources and evidence codes gathered into sets.
pro_go = (
    pro_go_dict['pro_go_NOT|acts_upstream_of_or_within_negative_effect']
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'GO_evidence': set})
    .reset_index()
    .assign(**{':TYPE': 'not_acts_upstream_of_or_within_negative_effect'})
)
pro_go.to_pickle(unprocessed_edge_data_location + 'OBO_not_acts_upstream_of_or_within_negative_effect_OBO.pkl')
pro_go.head()

***
### COSMIC - http://purl.obolibrary.org/obo/RO_0002566 (causally influences) - OBO

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/)

In [None]:
# Download the LncBook 2.0 variation table (read below into lncRNA_disease2).
!wget https://ngdc.cncb.ac.cn/lncbook/files/variation_LncBook2.0.csv.gz -O ../resources/processed_data/unprocessed_data/variation_LncBook2.0.csv.gz

In [None]:
# LncBook variation table: only the COSMIC columns are kept
lncRNA_disease2 = pd.read_csv(unprocessed_data_location+'variation_LncBook2.0.csv.gz').drop(
    columns=['Symbol','ClinVar Allele ID','ClinVar Variation Effect','ClinVar Disease Name','Variant Name','dbSNP ID']) # Mondo+HPO
print(lncRNA_disease2['COSMIC Variation Effect'].unique())

# Keep pathogenic variants that carry a tumour name ('-' marks a missing name)
lncRNA_disease2 = lncRNA_disease2[lncRNA_disease2['COSMIC Variation Effect'] == 'Pathogenic'].drop(columns=['COSMIC Variation Effect'])
lncRNA_disease2 = lncRNA_disease2[lncRNA_disease2['COSMIC Tumor Name'] != '-'].copy()

# One row per tumour name (';'-separated list), without the parenthesised
# qualifier. Whitespace is stripped on both sides: deleting "(...)" leaves a
# trailing blank that would otherwise block the exact-match merge below.
lncRNA_disease2['COSMIC Tumor Name'] = lncRNA_disease2['COSMIC Tumor Name'].str.split(';')
lncRNA_disease2 = lncRNA_disease2.explode('COSMIC Tumor Name')
lncRNA_disease2['COSMIC Tumor Name'] = (
    lncRNA_disease2['COSMIC Tumor Name']
    .str.replace(r"\(.*?\)", "", regex=True)
    .str.strip()
    .str.lower()
)

# Attach RNAcentral IDs for the LncBook genes, falling back to the gene ID.
# NOTE(review): 'Gene ID', not 'RNAcentral ID', is what becomes 'Interactor'
# below - confirm which identifier is intended.
lncRNA_disease2 = pd.merge(lncRNA_disease2, rnacentral_map_human_lncbook[['RNAcentral ID','LncBook Gene ID']].drop_duplicates().rename(
    columns={'LncBook Gene ID':'Gene ID'}), on='Gene ID', how='left')
lncRNA_disease2['RNAcentral ID'] = lncRNA_disease2['RNAcentral ID'].fillna(lncRNA_disease2['Gene ID'])

# Inner join on desc_disPhe_map (column 0 -> column 1): unmatched tumour names are dropped
lncRNA_disease2 = pd.merge(lncRNA_disease2, desc_disPhe_map, right_on=0, left_on='COSMIC Tumor Name')

lncRNA_disease2['Source'] = 'LncBook'
lncRNA_disease2 = lncRNA_disease2.rename(columns={1:':END_ID','Gene ID':'Interactor','COSMIC Mutation ID':':START_ID'})
lncRNA_disease2.head(n=3)

* SomamiR

In [None]:
# SomamiR lncRNA table; the first column header comes through as
# 'lncRNA_somatic_v2.0.txt' and holds the gene symbol.
miRNA_lncRNA2 = pd.read_csv(unprocessed_data_location+'lncRNA_somatic_v2.0.txt.tar.gz',sep='\t',dtype={'PMID':str})
miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['Unnamed: 18']) # Mondo+HPO
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'lncRNA_somatic_v2.0.txt':'Gene'})
miRNA_lncRNA2['Gene'] = miRNA_lncRNA2['Gene'].str.replace('lnc-', '', regex=False)
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'Gene': 'symbol', 'miRNA':'mir_id'})
# Keep only rows flagged as TargetScan sites
miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['TargetScan_Site(0=No;1=Yes)'] == 1]

# Lift LNCipedia 5.0 transcript IDs to 5.2, then attach RNAcentral IDs
lncpedia_map = pd.read_csv("https://lncipedia.org/downloads/lncipedia_5_0/lncipedia_5_0_vs_5_2.txt", sep='\t')
miRNA_lncRNA2 = miRNA_lncRNA2.merge(lncpedia_map.rename(columns={'LNCipedia 5.0 Transcript ID':'Transcript'}),
                                    on='Transcript').drop(columns=['Transcript']).rename(
                                        columns={'LNCipedia 5.2 Transcript ID':'Transcript'})
# NOTE(review): the RNAcentral map is the LEFT frame of this left join, so
# every map row is kept and SomamiR rows without a mapping are lost; map-only
# rows are removed by the Cancer_Class filter below. Confirm the direction.
miRNA_lncRNA2 = pd.merge(rnacentral_map_human_lncipedia.rename(columns={'LNCipedia transcript ID':'Transcript'}),
                         miRNA_lncRNA2, on='Transcript', how='left')

miRNA_lncRNA2['RNAcentral ID'] = miRNA_lncRNA2['RNAcentral ID'].fillna(miRNA_lncRNA2['Transcript'])

miRNA_lncRNA2 = miRNA_lncRNA2.drop(columns=['symbol','Chromosome','Location','Ref_Allele','Sample_Name',
                                            'Mut_Allele','FuncClass','Alteration','Target_Site',
                                            'Seed','SeedClass', 'TargetScan_Site(0=No;1=Yes)','Mut_ID'])

miRNA_lncRNA2 = miRNA_lncRNA2[miRNA_lncRNA2['Cancer_Class'].notna()].copy()
miRNA_lncRNA2['Cancer_Class'] = miRNA_lncRNA2['Cancer_Class'].str.lower()
# "[ns]" becomes "[cancer]". The result is now assigned back (it used to be
# discarded) and matched literally, since as a regex '[ns]' is a character
# class matching every 'n' and 's'.
miRNA_lncRNA2['Cancer_Class'] = miRNA_lncRNA2['Cancer_Class'].str.replace('[ns]', '[cancer]', regex=False)
# Disease label = first bracketed field + last bracketed field, '_' -> ' '
miRNA_lncRNA2['Cancer_Class'] = miRNA_lncRNA2['Cancer_Class'].apply(
    lambda x: ' '.join([x.split('[')[1].split(']')[0], x.split('[')[-1].split(']')[0]])).str.replace('_', ' ')

# Normalise PMIDs to plain digit strings. Only a trailing ".0" (float
# formatting) is removed; an unanchored '.0' pattern would, as a regex, delete
# any character followed by '0'.
pmid_text = pd.to_numeric(miRNA_lncRNA2['PMID'], errors='coerce').astype(str)
pmid_text = pmid_text.str.replace(r'\.0$', '', regex=True)
miRNA_lncRNA2['PMID'] = pmid_text.replace(['<NA>', 'nan'], np.nan)

# Inner join on desc_disPhe_map (column 0 -> column 1): unmatched labels are dropped
miRNA_lncRNA2 = pd.merge(miRNA_lncRNA2, desc_disPhe_map, right_on=0, left_on='Cancer_Class')
# Each record is emitted twice: once with the miRNA and once with the lncRNA as 'Interactor'
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2.rename(columns={'mir_id':'Interactor'}), miRNA_lncRNA2.rename(columns={'RNAcentral ID':'Interactor'})])

# Every row is credited to both SomamiR and TargetScan
miRNA_lncRNA2['Source'] = 'SomamiR'
miRNA_lncRNA3 = miRNA_lncRNA2.copy()
miRNA_lncRNA3['Source'] = 'TargetScan'
miRNA_lncRNA2 = pd.concat([miRNA_lncRNA2, miRNA_lncRNA3])
miRNA_lncRNA2 = miRNA_lncRNA2.rename(columns={'COSMIC_ID':':START_ID',1:':END_ID','PMID':'PubMedID'})
miRNA_lncRNA2.head(n=3)

In [None]:
# Fetch the SomamiR miRNA somatic-mutation archive into unprocessed_data_location.
# NOTE(review): the next cell reads 'miRNA_somatic_v2.0.txt.tar', so data_downloader
# presumably gunzips the file - confirm.
data_downloader('https://compbio.uthsc.edu/SomamiR/download/miRNA_somatic_v2.0.txt.tar.gz', unprocessed_data_location)

In [None]:
# SomamiR miRNA table, reduced to the columns needed for the edges
somamir = pd.read_csv(unprocessed_data_location +
                      'miRNA_somatic_v2.0.txt.tar',sep='\t').drop(
                          columns=['Reference','Derived','SNP','Whole_Genome','Whole_Exome','Study_ID','Source',
                          'miRNA_Chromosome','Strand','Maturestart','Matureend','Mutation_Distance','Regioin',
                            'miR2GO_Execution_Sequence','Unnamed: 19','Sample_Name']) # Mondo+HPO
# Remove the literal "[NS]" fields. regex=False matters: as a regex, "[NS]" is
# a character class that deletes every 'N' and 'S'.
somamir['Cancer_Type'] = somamir['Cancer_Type'].str.replace("[NS]", "", regex=False)
# Drop rows left with an empty label; .copy() so the assignment below writes
# to an independent frame
somamir = somamir[somamir['Cancer_Type'] != ""].copy()
# Disease label = first bracketed field + last bracketed field, '_' -> ' '
somamir['Cancer_Type'] = somamir['Cancer_Type'].apply(
    lambda x: ' '.join([x.split('[')[1].split(']')[0], x.split('[')[-1].split(']')[0]])).str.replace('_', ' ')

# Inner join on desc_disPhe_map (column 0 -> column 1): unmatched labels are dropped
somamir = pd.merge(somamir, desc_disPhe_map, right_on=0, left_on='Cancer_Type')

somamir['Source'] = 'SomamiR'
somamir = somamir.rename(columns={'miRNA_Name':'Interactor',1:':END_ID','COSMIC_ID':':START_ID'})
somamir.head(n=3)

In [None]:
# SomamiR circRNA table: keep rows flagged as TargetScan sites and drop the
# columns not needed for the edges
circRNA_miRNA = pd.read_csv(unprocessed_data_location + 'circRNA_somatic_v2.0.txt.tar.gz', sep="\t")
circRNA_miRNA = circRNA_miRNA[circRNA_miRNA['TargetScan_Site(0=No;1=Yes)'] == 1]
circRNA_miRNA = circRNA_miRNA.drop(columns=['Gene','Mut_ID','Chromosome','Location','Mut_Allele','FuncClass','Alteration','Ref_Allele',
                                            'Target_Site','Seed','SeedClass','TargetScan_Site(0=No;1=Yes)','Sample_Name','Unnamed: 18'])

# Normalise PMIDs to plain digit strings. Only a trailing ".0" (float
# formatting) is removed; an unanchored '.0' pattern would, as a regex, delete
# any character followed by '0'.
pmid_text = pd.to_numeric(circRNA_miRNA['PMID'], errors='coerce').astype(str)
pmid_text = pmid_text.str.replace(r'\.0$', '', regex=True)
circRNA_miRNA['PMID'] = pmid_text.replace(['<NA>', 'nan'], np.nan)

# Disease label: lowercase, remove the literal "[ns]" fields (regex=False, as
# '[ns]' is otherwise a character class matching every 'n' and 's'), then join
# the remaining bracketed fields with spaces. The label is a plain string, so
# the explode() that used to follow was a no-op.
cancer_class = circRNA_miRNA['Cancer_Class'].str.lower()
cancer_class = cancer_class.str.replace('[ns]', '', regex=False)
cancer_class = cancer_class.str.replace('_', ' ', regex=False)
circRNA_miRNA['Cancer_Class'] = cancer_class.astype(str).apply(
     lambda x: ' '.join(re.findall(r'\[(.*?)\]', x)) if '[' in x and ']' in x else x)

# Inner join on desc_disPhe_map (column 0 -> column 1): unmatched labels are dropped
circRNA_miRNA = pd.merge(circRNA_miRNA, desc_disPhe_map, right_on=0, left_on='Cancer_Class')
# Each record is emitted twice: once with the circRNA transcript and once with the miRNA as 'Interactor'
circRNA_miRNA = pd.concat([circRNA_miRNA.rename(columns={'Transcript':'Interactor'}), circRNA_miRNA.rename(columns={'miRNA':'Interactor'})])

# Every row is credited to both SomamiR and TargetScan
circRNA_miRNA['Source'] = 'SomamiR'
circRNA_miRNA2 = circRNA_miRNA.copy()
circRNA_miRNA2['Source'] = 'TargetScan'
circRNA_miRNA = pd.concat([circRNA_miRNA, circRNA_miRNA2])
RNA_RNA8 = circRNA_miRNA.rename(columns={1:':END_ID','COSMIC_ID':':START_ID', 'PMID':'PubMedID'}).drop_duplicates()
RNA_RNA8.head(n=3)

In [None]:
# Pool the four COSMIC-to-disease tables and collapse duplicate
# (:START_ID, :END_ID) pairs, gathering provenance fields into sets.
cosmic_sources = [lncRNA_disease2, miRNA_lncRNA2, somamir, RNA_RNA8]
cosmic_causally_influences_OBO = (
    pd.concat(cosmic_sources)
    .groupby([':START_ID', ':END_ID'])
    .agg({'Source': set, 'Interactor': set, 'PubMedID': set})
    .reset_index()
    .assign(**{':TYPE': 'causally_influences'})
)
cosmic_causally_influences_OBO.to_pickle(unprocessed_edge_data_location + 'COSMIC_causally_influences_OBO.pkl')

# Inverse edges: swap the endpoints and relabel
OBO_causally_influenced_by_COSMIC = (
    cosmic_causally_influences_OBO
    .rename(columns={':START_ID': ':END_ID', ':END_ID': ':START_ID'})
    .assign(**{':TYPE': 'causally_influenced_by'})
)
OBO_causally_influenced_by_COSMIC.to_pickle(unprocessed_edge_data_location + 'OBO_causally_influenced_by_COSMIC.pkl')
OBO_causally_influenced_by_COSMIC.head(n=3)


<br>

***
***

```
@misc{cavalleri_e_2024_rna_kg,
  author       = {Cavalleri, E},
  title        = {RNA-KG},
  year         = 2024,
  doi          = {10.5281/zenodo.10078876},
  url          = {https://doi.org/10.5281/zenodo.10078876}
}
```