# <p style="text-align: center;">Inter-RNA Knowledge Graph Build Data Preparation</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@studenti.unimi.it), [TJCallahan](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=callahantiff@gmail.com), [MMesiti](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=marco.mesiti@unimi.it), [GValentini](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=giorgio.valentini@unimi.it)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/), [PheKnowLator](https://github.com/callahantiff/PheKnowLator/)  
<!--- **Release:** **[v2.0.0](https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0)** --->
  
<br>  
  
**Purpose:** This notebook serves as a script to download, process, map, and clean data in order to build edges for inteRNA-KG. For more information on the data sources used in this script, please see the [PheKnowLator Data Sources](https://github.com/callahantiff/PheKnowLator/wiki/v2-Data-Sources) Wiki page.

<br>

**Assumptions:**   
- Edge data downloads ➞ `./resources/edge_data`  
- Ontologies ➞ `./resources/ontologies`    
- Processed data write location ➞ `./resources/processed_data`  

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts. 
_____
***

## Set-Up Environment
_____

In [None]:
%%capture
# Install the packages listed in requirements.txt using the interpreter that runs this kernel.
import sys
!{sys.executable} -m pip install -r requirements.txt
# Add the parent directory to the module search path
# (presumably so the `pkt_kg` and `builds` imports below resolve -- confirm against repo layout).
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

#### Define Global Variables

In [None]:
# Root directory for all resources; the local sub-directories below are derived from it.
resource_data_location = '../resources/'

# Processed data and the unprocessed inputs stored beneath it.
processed_data_location = resource_data_location + 'processed_data/'
unprocessed_data_location = processed_data_location + 'unprocessed_data/'

# Ontology files and generated edge lists.
ontology_data_location = resource_data_location + 'ontologies/'
edge_data_location = resource_data_location + 'edge_data/'

# Remote locations of the PheKnowLator processed and original data.
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'
original_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/original_data/'

# Path to the owltools library.
owltools_location = '../pkt_kg/libs/owltools'

In [None]:
# Load DOID_MONDO_MAP.txt (tab-separated, no header row) and display it.
doid_mondo_map = pd.read_csv(
    f'{processed_data_location}DOID_MONDO_MAP.txt', sep='\t', header=None)
doid_mondo_map

In [None]:
# Load ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt (tab-separated, no header row) and keep only
# its first two columns (labelled 0 and 1).
entrez_pro_map = pd.read_csv(
    f'{processed_data_location}ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt', sep='\t', header=None)
entrez_pro_map = entrez_pro_map.loc[:, [0, 1]]
entrez_pro_map

In [None]:
# Load DESC_CHEBI_MAP.txt (tab-separated, no header row) and display it.
desc_chebi_map = pd.read_csv(
    f'{unprocessed_data_location}DESC_CHEBI_MAP.txt', sep='\t', header=None)
desc_chebi_map

In [None]:
# Load DESC_GO_MAP.txt (tab-separated, no header row) and display it.
desc_go_map = pd.read_csv(
    f'{unprocessed_data_location}DESC_GO_MAP.txt', sep='\t', header=None)
desc_go_map

In [None]:
# Load DESC_PW_MAP.txt (tab-separated, no header row) and display it.
desc_pw_map = pd.read_csv(
    f'{unprocessed_data_location}DESC_PW_MAP.txt', sep='\t', header=None)
desc_pw_map

In [None]:
# Load SYN_PW_MAP.txt (tab-separated, no header row) and stack it on top of the
# previously loaded desc_pw_map.
syn_pw_map = pd.read_csv(unprocessed_data_location+'SYN_PW_MAP.txt', header=None, delimiter='\t')
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in the same order
# (syn_pw_map first, then desc_pw_map) and keeps the original index labels.
desc_pw_map = pd.concat([syn_pw_map, desc_pw_map])
desc_pw_map

In [None]:
# Load MIRNA_MIRBASE_MAP.txt (tab-separated, no header row) and display it.
mirna_mirbase_map = pd.read_csv(
    f'{processed_data_location}MIRNA_MIRBASE_MAP.txt', delimiter='\t', header=None)
mirna_mirbase_map

In [None]:
# Load GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt (tab-separated, no header row) and display it.
symbol_to_pro = pd.read_csv(
    f'{processed_data_location}GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt', delimiter='\t', header=None)
symbol_to_pro

In [None]:
# Load DESC_PRO_MAP.txt (tab-separated, no header row, every column read as object) and
# normalise the labels in column 0.
desc_pro_map = pd.read_csv(unprocessed_data_location+'DESC_PRO_MAP.txt', header=None, delimiter='\t', dtype=object)

# The default of `regex` in Series.str.replace differs between pandas versions, so it is
# spelled out: the first three substitutions are literal, the last one is a pattern.
_labels = desc_pro_map[0]
_labels = _labels.str.replace("human", '', regex=False)
_labels = _labels.str.replace("(", '', regex=False)
_labels = _labels.str.replace(")", '', regex=False)
# Drop everything from the first comma to the end of the label.
_labels = _labels.str.replace(r",(.*)", '', regex=True)
desc_pro_map[0] = _labels
desc_pro_map

In [None]:
# Load GENE_SYMBOL_ENTREZ_ID_MAP.txt (tab-separated, no header row) and display it.
symbol_entrez_map = pd.read_csv(
    f'{processed_data_location}GENE_SYMBOL_ENTREZ_ID_MAP.txt', delimiter='\t', header=None)
symbol_entrez_map

In [None]:
# Load DESC_BTO_MAP.txt (tab-separated, no header row, every column read as object) and
# normalise the labels in column 0.
desc_bto_map = pd.read_csv(unprocessed_data_location+'DESC_BTO_MAP.txt', header=None, delimiter='\t', dtype=object)

# The default of `regex` in Series.str.replace differs between pandas versions, so it is
# spelled out: the first three substitutions are literal, the last one is a pattern.
_labels = desc_bto_map[0]
_labels = _labels.str.replace("human", '', regex=False)
_labels = _labels.str.replace("(", '', regex=False)
_labels = _labels.str.replace(")", '', regex=False)
# Drop everything from the first comma to the end of the label.
_labels = _labels.str.replace(r",(.*)", '', regex=True)
desc_bto_map[0] = _labels
desc_bto_map

***
***
## DOWNLOAD AND PROCESS EDGE DATASETS  <a class="anchor" id="create-edges"></a>
***
***

### [RNAInter](http://www.rnainter.org/)
RNAInter integrates experimentally validated and computationally predicted RNA interactome data from the literature and databases.

* #### RNA-RNA

In [None]:
# Source archive: http://www.rnainter.org/raidMedia/download/Download_data_RR.tar.gz
RNA_RNA = pd.read_csv(unprocessed_data_location+'Download_data_RR.txt',sep='\t')
# We select only strong evidence interactions for hsa
RNA_RNA = RNA_RNA[(RNA_RNA['score'] >= 0.2886) &
                  (RNA_RNA['Species1'].str.contains('apiens')) &
                  (RNA_RNA['Species2'].str.contains('apiens'))]

# We keep only entries where at least one of the two raw IDs starts with
# NCBI, miRBase, tRFdb or HG19_TRNAS_.
# NOTE(review): str.startswith yields NaN for missing raw IDs; verify how those rows are
# meant to combine under `|` before restructuring this filter.
RNA_RNA = RNA_RNA[(RNA_RNA['Raw_ID1'].str.startswith('NCBI')) |
                  (RNA_RNA['Raw_ID1'].str.startswith('miRBase')) |
                  (RNA_RNA['Raw_ID1'].str.startswith('tRFdb')) |
                  (RNA_RNA['Raw_ID1'].str.startswith('HG19_TRNAS_')) |
                  (RNA_RNA['Raw_ID2'].str.startswith('NCBI')) |
                  (RNA_RNA['Raw_ID2'].str.startswith('miRBase')) |
                  (RNA_RNA['Raw_ID2'].str.startswith('tRFdb')) |
                  (RNA_RNA['Raw_ID2'].str.startswith('HG19_TRNAS_'))
                 ].copy()  # explicit copy: the columns are reassigned below

# Strip the database prefixes from both raw-ID columns (literal, not regex, replacement).
#   NCBI:    PCG, circRNA, lncRNA, mRNA, ncRNA, others (RNA gene), pseudo, rRNA,
#            ribozyme, scRNA, scaRNA, snRNA, snoRNA, unknown (RNA), tRNA
#   miRBase: miRNA
#   tRFdb:   tRF
for _id_col in ('Raw_ID1', 'Raw_ID2'):
    for _prefix in ("NCBI:", "miRBase:", "tRFdb:"):
        RNA_RNA[_id_col] = RNA_RNA[_id_col].str.replace(_prefix, '', regex=False)

# tRNA
# NOTE(review): only Interactor1.Symbol is stripped; Interactor2.Symbol keeps the
# HG19_TRNAS_ prefix -- confirm this asymmetry is intended.
RNA_RNA['Interactor1.Symbol'] = RNA_RNA['Interactor1.Symbol'].str.replace("HG19_TRNAS_", '', regex=False)

# A raw-ID field may hold several ';'-separated identifiers: give each its own row.
RNA_RNA['Raw_ID1'] = RNA_RNA['Raw_ID1'].str.split(';')
RNA_RNA = RNA_RNA.explode('Raw_ID1')
RNA_RNA['Raw_ID2'] = RNA_RNA['Raw_ID2'].str.split(';')
RNA_RNA = RNA_RNA.explode('Raw_ID2')
# explode() repeats the original index label on every generated row; renumber so that
# later label-based selections address exactly one row each.
RNA_RNA = RNA_RNA.reset_index(drop=True)

RNA_RNA

In [None]:
# Append '#<category>' to the raw ID of every interactor whose category is listed below.
_suffixed_categories = ['circRNA', 'lncRNA', 'mRNA', 'ncRNA', 'others', 'rRNA', 'ribozyme',
                        'pseudo', 'scRNA', 'scaRNA', 'snRNA', 'snoRNA', 'unknown', 'tRNA']

for _id_col, _cat_col in (('Raw_ID1', 'Category1'), ('Raw_ID2', 'Category2')):
    _needs_suffix = RNA_RNA[_cat_col].isin(_suffixed_categories)
    # Missing IDs stay missing: NaN + '#' + category evaluates to NaN.
    _tagged = RNA_RNA[_id_col] + '#' + RNA_RNA[_cat_col]
    # Positional selection with a boolean mask. Selecting by index labels breaks when the
    # frame carries duplicate labels (which DataFrame.explode produces).
    RNA_RNA[_id_col] = np.where(_needs_suffix, _tagged, RNA_RNA[_id_col])

#RNA_RNA.drop(columns=['RNAInterID','Interactor1.Symbol','Species1','Interactor2.Symbol','Species2'],inplace=True)
RNA_RNA

In [None]:
# Print every category value that occurs on either side of an interaction.
a = set(RNA_RNA['Category1'])
b = set(RNA_RNA['Category2'])
print(a | b)

* ##### miRNA-RNA

In [None]:
# Collect every interaction involving a miRNA, oriented so that the miRNA is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the miRNA on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
miRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'miRNA']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_miRNA = RNA_RNA[RNA_RNA['Category2'] == 'miRNA'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
miRNA_RNA = pd.concat([miRNA_RNA, RNA_miRNA])

In [None]:
# Show which partner categories occur in miRNA_RNA.
miRNA_RNA.Category2.unique()

In [None]:
# miRNA-mRNA edges, split by miRNA accession type; the target ID must start with a digit.
miRNA_mRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('mRNA')]
miRNA_mRNA = miRNA_mRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_mRNA['Raw_ID1'].str.startswith('MIMAT')
_hairpin = miRNA_mRNA['Raw_ID1'].str.startswith('MI') & ~_mature  # 'MI...' but not 'MIMAT...'
_digit_target = miRNA_mRNA['Raw_ID2'].str[0].str.isdigit()

maturemiRNA_mRNA = miRNA_mRNA[_mature & _digit_target]
premiRNA_mRNA = miRNA_mRNA[_hairpin & _digit_target]

# Write the de-duplicated ID pairs as headerless, index-free TSV files.
(maturemiRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-mRNA.txt', sep='\t', header=False, index=False))
(premiRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}premiRNA-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-lncRNA edges, split by miRNA accession type; the target ID must start with a digit.
miRNA_lncRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('lncRNA')]
miRNA_lncRNA = miRNA_lncRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_lncRNA['Raw_ID1'].str.startswith('MIMAT')
_hairpin = miRNA_lncRNA['Raw_ID1'].str.startswith('MI') & ~_mature  # 'MI...' but not 'MIMAT...'
_digit_target = miRNA_lncRNA['Raw_ID2'].str[0].str.isdigit()

maturemiRNA_lncRNA = miRNA_lncRNA[_mature & _digit_target]
premiRNA_lncRNA = miRNA_lncRNA[_hairpin & _digit_target]

# Write the de-duplicated ID pairs as headerless, index-free TSV files.
(maturemiRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-lncRNA.txt', sep='\t', header=False, index=False))
(premiRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}premiRNA-lncRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-circRNA edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_circRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('circRNA')]
miRNA_circRNA = miRNA_circRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_circRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_circRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_circRNA = miRNA_circRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_circRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-circRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-others edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_othersRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('others')]
miRNA_othersRNA = miRNA_othersRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_othersRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_othersRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_othersRNA = miRNA_othersRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_othersRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-othersRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-pseudogene edges, split by miRNA accession type; the target ID must start with a digit.
miRNA_pseudo = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('pseudo')]
miRNA_pseudo = miRNA_pseudo.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_pseudo['Raw_ID1'].str.startswith('MIMAT')
_hairpin = miRNA_pseudo['Raw_ID1'].str.startswith('MI') & ~_mature  # 'MI...' but not 'MIMAT...'
_digit_target = miRNA_pseudo['Raw_ID2'].str[0].str.isdigit()

maturemiRNA_pseudo = miRNA_pseudo[_mature & _digit_target]
premiRNA_pseudo = miRNA_pseudo[_hairpin & _digit_target]

# Write the de-duplicated ID pairs as headerless, index-free TSV files.
(maturemiRNA_pseudo[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-pseudogene.txt', sep='\t', header=False, index=False))
(premiRNA_pseudo[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}premiRNA-pseudogene.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-protein edges: the partner ID is cast to int and joined to column 0 of
# entrez_pro_map; column 1 of that map becomes the edge target.
miRNA_protein = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') &
                          (miRNA_RNA['Category2'].eq('protein') | miRNA_RNA['Category2'].eq('Protein'))]
miRNA_protein = miRNA_protein.dropna(subset=['Raw_ID1', 'Raw_ID2'])
miRNA_protein = miRNA_protein.assign(Raw_ID2=miRNA_protein['Raw_ID2'].astype(str).astype(int))

# Inner join on Raw_ID2, then discard the join key.
_id_map = entrez_pro_map.rename(columns={0: 'Raw_ID2'})
miRNA_protein = miRNA_protein.merge(_id_map, on='Raw_ID2').drop(columns=['Raw_ID2'])

_mature = miRNA_protein['Raw_ID1'].str.startswith('MIMAT')
_hairpin = miRNA_protein['Raw_ID1'].str.startswith('MI') & ~_mature  # 'MI...' but not 'MIMAT...'
maturemiRNA_protein = miRNA_protein[_mature]
premiRNA_protein = miRNA_protein[_hairpin]

# Write the de-duplicated (miRNA ID, mapped column 1) pairs as headerless, index-free TSV files.
(maturemiRNA_protein[['Raw_ID1', 1]]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-protein.txt', sep='\t', header=False, index=False))
(premiRNA_protein[['Raw_ID1', 1]]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}premiRNA-protein.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-miRNA edges: both identifiers must be 'MIMAT' accessions.
miRNA_miRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('miRNA')]
miRNA_miRNA = miRNA_miRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature_source = miRNA_miRNA['Raw_ID1'].str.startswith('MIMAT')
_mature_target = miRNA_miRNA['Raw_ID2'].str.startswith('MIMAT')
maturemiRNA_maturemiRNA = miRNA_miRNA[_mature_source & _mature_target]
# Other combinations are empty

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_maturemiRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-miRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-snoRNA edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_snoRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('snoRNA')]
miRNA_snoRNA = miRNA_snoRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_snoRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_snoRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_snoRNA = miRNA_snoRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_snoRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-snoRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-snRNA edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_snRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('snRNA')]
miRNA_snRNA = miRNA_snRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_snRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_snRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_snRNA = miRNA_snRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_snRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-snRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-scaRNA edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_scaRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('scaRNA')]
miRNA_scaRNA = miRNA_scaRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_scaRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_scaRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_scaRNA = miRNA_scaRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_scaRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-scaRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-ncRNA edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_ncRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('ncRNA')]
miRNA_ncRNA = miRNA_ncRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_ncRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_ncRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_ncRNA = miRNA_ncRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_ncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-ncRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-ribozyme edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_ribozyme = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('ribozyme')]
miRNA_ribozyme = miRNA_ribozyme.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_ribozyme['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_ribozyme['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_ribozyme = miRNA_ribozyme[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_ribozyme[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-ribozyme.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-unknown edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_unknownRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('unknown')]
miRNA_unknownRNA = miRNA_unknownRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_unknownRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_unknownRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_unknownRNA = miRNA_unknownRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_unknownRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-unknownRNA.txt', sep='\t', header=False, index=False))

In [None]:
# miRNA-scRNA edges: 'MIMAT' miRNA accessions paired with digit-leading target IDs.
miRNA_scRNA = miRNA_RNA[miRNA_RNA['Category1'].eq('miRNA') & miRNA_RNA['Category2'].eq('scRNA')]
miRNA_scRNA = miRNA_scRNA.dropna(subset=['Raw_ID1', 'Raw_ID2'])

_mature = miRNA_scRNA['Raw_ID1'].str.startswith('MIMAT')
_digit_target = miRNA_scRNA['Raw_ID2'].str[0].str.isdigit()
maturemiRNA_scRNA = miRNA_scRNA[_mature & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(maturemiRNA_scRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}miRNA-scRNA.txt', sep='\t', header=False, index=False))

* ##### othersRNA-RNA

In [None]:
# Collect every interaction involving an 'others' RNA, oriented so that it is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with it on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
othersRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'others']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_othersRNA = RNA_RNA[RNA_RNA['Category2'] == 'others'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
othersRNA_RNA = pd.concat([othersRNA_RNA, RNA_othersRNA])

In [None]:
# Show which partner categories occur in othersRNA_RNA.
othersRNA_RNA['Category2'].unique()

In [None]:
# others-mRNA edges: keep pairs whose two identifiers both start with a digit.
othersRNA_mRNA = (othersRNA_RNA[othersRNA_RNA['Category2'].eq('mRNA')]
                  .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = othersRNA_mRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = othersRNA_mRNA['Raw_ID2'].str[0].str.isdigit()
othersRNA_mRNA = othersRNA_mRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(othersRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}othersRNA-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# others-lncRNA edges: keep pairs whose two identifiers both start with a digit.
othersRNA_lncRNA = (othersRNA_RNA[othersRNA_RNA['Category2'].eq('lncRNA')]
                    .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = othersRNA_lncRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = othersRNA_lncRNA['Raw_ID2'].str[0].str.isdigit()
othersRNA_lncRNA = othersRNA_lncRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(othersRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}othersRNA-lncRNA.txt', sep='\t', header=False, index=False))

In [None]:
# others-pseudogene edges: keep pairs whose two identifiers both start with a digit.
othersRNA_pseudo = (othersRNA_RNA[othersRNA_RNA['Category2'].eq('pseudo')]
                    .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = othersRNA_pseudo['Raw_ID1'].str[0].str.isdigit()
_digit_target = othersRNA_pseudo['Raw_ID2'].str[0].str.isdigit()
othersRNA_pseudo = othersRNA_pseudo[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(othersRNA_pseudo[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}othersRNA-pseudogene.txt', sep='\t', header=False, index=False))

In [None]:
# others-rRNA edges: keep pairs whose two identifiers both start with a digit.
othersRNA_rRNA = (othersRNA_RNA[othersRNA_RNA['Category2'].eq('rRNA')]
                  .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = othersRNA_rRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = othersRNA_rRNA['Raw_ID2'].str[0].str.isdigit()
othersRNA_rRNA = othersRNA_rRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(othersRNA_rRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}othersRNA-rRNA.txt', sep='\t', header=False, index=False))

* ##### snRNA-RNA

In [None]:
# Collect every interaction involving an snRNA, oriented so that the snRNA is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the snRNA on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
snRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'snRNA']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_snRNA = RNA_RNA[RNA_RNA['Category2'] == 'snRNA'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
snRNA_RNA = pd.concat([snRNA_RNA, RNA_snRNA])
snRNA_RNA.Category2.unique()

In [None]:
# snRNA-snRNA edges: keep pairs whose two identifiers both start with a digit.
snRNA_snRNA = (snRNA_RNA[snRNA_RNA['Category2'].eq('snRNA')]
               .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = snRNA_snRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = snRNA_snRNA['Raw_ID2'].str[0].str.isdigit()
snRNA_snRNA = snRNA_snRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snRNA_snRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snRNA-snRNA.txt', sep='\t', header=False, index=False))

In [None]:
# snRNA-lncRNA edges: keep pairs whose two identifiers both start with a digit.
snRNA_lncRNA = (snRNA_RNA[snRNA_RNA['Category2'].eq('lncRNA')]
                .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = snRNA_lncRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = snRNA_lncRNA['Raw_ID2'].str[0].str.isdigit()
snRNA_lncRNA = snRNA_lncRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snRNA-lncRNA.txt', sep='\t', header=False, index=False))

In [None]:
# snRNA-mRNA edges: keep pairs whose two identifiers both start with a digit.
snRNA_mRNA = (snRNA_RNA[snRNA_RNA['Category2'].eq('mRNA')]
              .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = snRNA_mRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = snRNA_mRNA['Raw_ID2'].str[0].str.isdigit()
snRNA_mRNA = snRNA_mRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snRNA-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# snRNA-pseudogene edges: keep pairs whose two identifiers both start with a digit.
snRNA_pseudo = (snRNA_RNA[snRNA_RNA['Category2'].eq('pseudo')]
                .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = snRNA_pseudo['Raw_ID1'].str[0].str.isdigit()
_digit_target = snRNA_pseudo['Raw_ID2'].str[0].str.isdigit()
snRNA_pseudo = snRNA_pseudo[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snRNA_pseudo[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snRNA-pseudogene.txt', sep='\t', header=False, index=False))

In [None]:
# snRNA-snoRNA edges: keep pairs whose two identifiers both start with a digit.
snRNA_snoRNA = (snRNA_RNA[snRNA_RNA['Category2'].eq('snoRNA')]
                .dropna(subset=['Raw_ID1', 'Raw_ID2']))

_digit_source = snRNA_snoRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = snRNA_snoRNA['Raw_ID2'].str[0].str.isdigit()
snRNA_snoRNA = snRNA_snoRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snRNA_snoRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snRNA-snoRNA.txt', sep='\t', header=False, index=False))

* ##### PCG-RNA

In [None]:
# Collect every interaction involving a PCG entry, oriented so that it is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with it on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
PCG_RNA = RNA_RNA[RNA_RNA['Category1'] == 'PCG']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_PCG = RNA_RNA[RNA_RNA['Category2'] == 'PCG'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
PCG_RNA = pd.concat([PCG_RNA, RNA_PCG])
PCG_RNA.Category2.unique()

In [None]:
# PCG-lncRNA edges: keep pairs whose two identifiers both start with a digit.
# Restrict to lncRNA partners, matching the output file name and the other edge cells.
# NOTE(review): the original cell had no category filter -- confirm PCG_RNA is expected to
# contain only lncRNA partners.
PCG_lncRNA = PCG_RNA[PCG_RNA['Category2'] == 'lncRNA']
# Require BOTH identifiers to be present; the second filter used to restart from PCG_RNA,
# which discarded the Raw_ID1 check.
PCG_lncRNA = PCG_lncRNA[PCG_lncRNA['Raw_ID1'].notna()]
PCG_lncRNA = PCG_lncRNA[PCG_lncRNA['Raw_ID2'].notna()]

PCG_lncRNA = PCG_lncRNA[(PCG_lncRNA['Raw_ID1'].str[0].str.isdigit()) &
                        (PCG_lncRNA['Raw_ID2'].str[0].str.isdigit())]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
PCG_lncRNA[['Raw_ID1','Raw_ID2']].drop_duplicates().to_csv(
    edge_data_location + 'PCG-lncRNA.txt', header=None, sep='\t', index=None)

* ##### tRNA-RNA

In [None]:
# Collect every interaction involving a tRNA, oriented so that the tRNA is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the tRNA on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
tRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'tRNA']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_tRNA = RNA_RNA[RNA_RNA['Category2'] == 'tRNA'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
tRNA_RNA = pd.concat([tRNA_RNA, RNA_tRNA])
# Fall back to the interactor symbol where the raw ID is missing. Assigning the result
# back (instead of a chained in-place fillna) also works under pandas copy-on-write.
tRNA_RNA['Raw_ID1'] = tRNA_RNA['Raw_ID1'].fillna(tRNA_RNA['Interactor1.Symbol'])
tRNA_RNA.Category2.unique()

In [None]:
# tRNA-mRNA edges, split by whether the tRNA identifier starts with a digit.
tRNA_mRNA = tRNA_RNA[tRNA_RNA['Category2']=='mRNA']
# A missing identifier would turn the digit test into NaN, which cannot be used (or
# negated) as a boolean mask, so such rows are dropped first.
tRNA_mRNA = tRNA_mRNA.dropna(subset=['Raw_ID1'])
# fillna covers empty strings, for which .str[0] yields NaN.
_digit_source = tRNA_mRNA['Raw_ID1'].str[0].str.isdigit().fillna(False).astype(bool)

tRNA_mRNA_NCBI = tRNA_mRNA[_digit_source]
# .copy() so the column assignment below does not write into a slice of tRNA_mRNA.
tRNA_mRNA_gtRNA = tRNA_mRNA[~_digit_source].copy()

# Non-numeric identifiers get an '.html' suffix
# (presumably to form a gtRNAdb page name -- confirm against the downstream consumer).
tRNA_mRNA_gtRNA['Raw_ID1'] = tRNA_mRNA_gtRNA['Raw_ID1'] + '.html'

# Write the de-duplicated ID pairs as headerless, index-free TSV files.
tRNA_mRNA_NCBI[['Raw_ID1','Raw_ID2']].drop_duplicates().to_csv(
    edge_data_location + 'tRNA-mRNA_NCBI.txt', header=None, sep='\t', index=None)
tRNA_mRNA_gtRNA[['Raw_ID1','Raw_ID2']].drop_duplicates().to_csv(
    edge_data_location + 'tRNA-mRNA_gtRNA.txt', header=None, sep='\t', index=None)

In [None]:
# tRNA-lncRNA edges: every row with an lncRNA partner, with no further ID filtering.
tRNA_lncRNA = tRNA_RNA.loc[tRNA_RNA['Category2'].eq('lncRNA')]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(tRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}tRNA-lncRNA.txt', sep='\t', header=False, index=False))

* ##### piRNA-RNA

In [None]:
# Collect every interaction involving a piRNA, oriented so that the piRNA is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the piRNA on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
piRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'piRNA']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_piRNA = RNA_RNA[RNA_RNA['Category2'] == 'piRNA'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
piRNA_RNA = pd.concat([piRNA_RNA, RNA_piRNA])
# Fall back to the interactor symbol where the raw ID is missing. Assigning the result
# back (instead of a chained in-place fillna) also works under pandas copy-on-write.
piRNA_RNA['Raw_ID1'] = piRNA_RNA['Raw_ID1'].fillna(piRNA_RNA['Interactor1.Symbol'])
piRNA_RNA.Category2.unique()

In [None]:
# piRNA-mRNA edges: every row with an mRNA partner, with no further ID filtering.
piRNA_mRNA = piRNA_RNA.loc[piRNA_RNA['Category2'].eq('mRNA')]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(piRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}piRNA-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# piRNA-lncRNA edges: every row with an lncRNA partner, with no further ID filtering.
piRNA_lncRNA = piRNA_RNA.loc[piRNA_RNA['Category2'].eq('lncRNA')]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(piRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}piRNA-lncRNA.txt', sep='\t', header=False, index=False))

* ##### tRF-RNA

In [None]:
# Collect every interaction involving a tRF, oriented so that the tRF is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the tRF on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
tRF_RNA = RNA_RNA[RNA_RNA['Category1'] == 'tRF']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_tRF = RNA_RNA[RNA_RNA['Category2'] == 'tRF'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
tRF_RNA = pd.concat([tRF_RNA, RNA_tRF])
tRF_RNA.Category2.unique()

In [None]:
# tRF-mRNA edges: both IDs present and the target ID starting with a digit.
tRF_mRNA = (tRF_RNA[tRF_RNA['Category2'].eq('mRNA')]
            .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_target = tRF_mRNA['Raw_ID2'].str[0].str.isdigit()
tRF_mRNA = tRF_mRNA[_digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(tRF_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}tRF-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# tRF-lncRNA edges: both IDs present and the target ID starting with a digit.
tRF_lncRNA = (tRF_RNA[tRF_RNA['Category2'].eq('lncRNA')]
              .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_target = tRF_lncRNA['Raw_ID2'].str[0].str.isdigit()
tRF_lncRNA = tRF_lncRNA[_digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(tRF_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}tRF-lncRNA.txt', sep='\t', header=False, index=False))

In [None]:
# tRF-pseudogene edges: both IDs present and the target ID starting with a digit.
tRF_pseudo = (tRF_RNA[tRF_RNA['Category2'].eq('pseudo')]
              .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_target = tRF_pseudo['Raw_ID2'].str[0].str.isdigit()
tRF_pseudo = tRF_pseudo[_digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(tRF_pseudo[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}tRF-pseudogene.txt', sep='\t', header=False, index=False))

* ##### snoRNA-RNA

In [None]:
# Collect every interaction involving a snoRNA, oriented so that the snoRNA is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the snoRNA on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
snoRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'snoRNA']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_snoRNA = RNA_RNA[RNA_RNA['Category2'] == 'snoRNA'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
snoRNA_RNA = pd.concat([snoRNA_RNA, RNA_snoRNA])
snoRNA_RNA.Category2.unique()

In [None]:
# snoRNA-lncRNA edges: keep pairs whose two identifiers both start with a digit.
snoRNA_lncRNA = (snoRNA_RNA[snoRNA_RNA['Category2'].eq('lncRNA')]
                 .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_source = snoRNA_lncRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = snoRNA_lncRNA['Raw_ID2'].str[0].str.isdigit()
snoRNA_lncRNA = snoRNA_lncRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snoRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snoRNA-lncRNA.txt', sep='\t', header=False, index=False))

In [None]:
# snoRNA-mRNA edges: keep pairs whose two identifiers both start with a digit.
snoRNA_mRNA = (snoRNA_RNA[snoRNA_RNA['Category2'].eq('mRNA')]
               .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_source = snoRNA_mRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = snoRNA_mRNA['Raw_ID2'].str[0].str.isdigit()
snoRNA_mRNA = snoRNA_mRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snoRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snoRNA-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# snoRNA-pseudogene edges: keep pairs whose two identifiers both start with a digit.
snoRNA_pseudo = (snoRNA_RNA[snoRNA_RNA['Category2'].eq('pseudo')]
                 .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_source = snoRNA_pseudo['Raw_ID1'].str[0].str.isdigit()
_digit_target = snoRNA_pseudo['Raw_ID2'].str[0].str.isdigit()
snoRNA_pseudo = snoRNA_pseudo[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(snoRNA_pseudo[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}snoRNA-pseudogene.txt', sep='\t', header=False, index=False))

* ##### lncRNA-RNA

In [None]:
# Collect every interaction involving an lncRNA, oriented so that the lncRNA is interactor 1.
# Only the symbol, category and raw-ID columns are swapped for rows with the lncRNA on side 2.
_swap_sides = {'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
               'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'}
lncRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'lncRNA']
# rename() returns a new frame, avoiding an in-place rename on a slice of RNA_RNA.
RNA_lncRNA = RNA_RNA[RNA_RNA['Category2'] == 'lncRNA'].rename(columns=_swap_sides)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames the same way.
lncRNA_RNA = pd.concat([lncRNA_RNA, RNA_lncRNA])
lncRNA_RNA.Category2.unique()

In [None]:
# lncRNA-mRNA edges: keep pairs whose two identifiers both start with a digit.
lncRNA_mRNA = (lncRNA_RNA[lncRNA_RNA['Category2'].eq('mRNA')]
               .dropna(subset=['Raw_ID1', 'Raw_ID2']))
_digit_source = lncRNA_mRNA['Raw_ID1'].str[0].str.isdigit()
_digit_target = lncRNA_mRNA['Raw_ID2'].str[0].str.isdigit()
lncRNA_mRNA = lncRNA_mRNA[_digit_source & _digit_target]

# Write the de-duplicated ID pairs as a headerless, index-free TSV file.
(lncRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
 .drop_duplicates()
 .to_csv(f'{edge_data_location}lncRNA-mRNA.txt', sep='\t', header=False, index=False))

In [None]:
# lncRNA-lncRNA pairs: both raw IDs must be present and begin with a digit.
lncRNA_lncRNA = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'lncRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    lncRNA_lncRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-lncRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-rRNA pairs: both raw IDs must be present and begin with a digit.
lncRNA_rRNA = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'rRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    lncRNA_rRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-rRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-pseudogene pairs: both raw IDs must be present and begin with a digit.
lncRNA_pseudo = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'pseudo']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    lncRNA_pseudo[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-pseudogene.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-protein pairs. The partner category is matched in both spellings used
# by the source ('protein' / 'Protein'); the previous second alternative,
# 'Orotein', was a typo that could never match.
lncRNA_protein = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'].isin(['protein', 'Protein'])]
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Integer key for the join below (presumably matching the dtype of column 0 of
# entrez_pro_map -- confirm against where that map is loaded).
lncRNA_protein['Raw_ID2'] = lncRNA_protein['Raw_ID2'].astype(str).astype(int)

# Replace the raw protein-side ID by column 1 of entrez_pro_map.
lncRNA_protein = (
    lncRNA_protein
    .merge(entrez_pro_map.rename(columns={0: 'Raw_ID2'}), on='Raw_ID2')
    .drop(columns=['Raw_ID2'])
)

# Unique (lncRNA ID, mapped protein ID) pairs go to a headerless TSV.
lncRNA_protein[['Raw_ID1', 1]].drop_duplicates().to_csv(
    edge_data_location + 'lncRNA-protein.txt', header=None, sep='\t', index=None)

In [None]:
# lncRNA-ncRNA pairs: both raw IDs must be present and begin with a digit.
lncRNA_ncRNA = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'ncRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    lncRNA_ncRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-ncRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-scaRNA pairs: both raw IDs must be present and begin with a digit.
lncRNA_scaRNA = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'scaRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    lncRNA_scaRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-scaRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-TF pairs: both raw IDs must be present and begin with a digit.
lncRNA_TF = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'TF']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Integer key for the join below (presumably matching the dtype of column 0 of
# entrez_pro_map -- confirm against where that map is loaded).
lncRNA_TF['Raw_ID2'] = lncRNA_TF['Raw_ID2'].astype(str).astype(int)

# Replace the raw TF-side ID by column 1 of entrez_pro_map.
lncRNA_TF = (
    lncRNA_TF
    .merge(entrez_pro_map.rename(columns={0: 'Raw_ID2'}), on='Raw_ID2')
    .drop(columns=['Raw_ID2'])
)

# Unique (lncRNA ID, mapped TF ID) pairs go to a headerless TSV.
(
    lncRNA_TF[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-TF.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-ribozyme pairs: both raw IDs must be present and begin with a digit.
lncRNA_ribozyme = (
    lncRNA_RNA
    .loc[lncRNA_RNA['Category2'] == 'ribozyme']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    lncRNA_ribozyme[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'lncRNA-ribozyme.txt', header=None, sep='\t', index=None)
)

* ##### eRNA-RNA

In [None]:
# Gather every interaction involving an eRNA, oriented so that the eRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
eRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'eRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_eRNA = RNA_RNA[RNA_RNA['Category2'] == 'eRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
eRNA_RNA = pd.concat([eRNA_RNA, RNA_eRNA])
# Displayed to see which partner categories are present.
eRNA_RNA.Category2.unique()

In [None]:
# eRNA edges are keyed on the eRNA symbol (interactor 1) and the partner's raw ID.
# NOTE(review): no Category2 filter is applied although the file is named
# 'eRNA-mRNA.txt' -- confirm that mRNA is the only partner category present.
(
    eRNA_RNA
    .loc[:, ['Interactor1.Symbol', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'eRNA-mRNA.txt', header=None, sep='\t', index=None)
)

* ##### circRNA-RNA

In [None]:
# Gather every interaction involving a circRNA, oriented so that the circRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
circRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'circRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_circRNA = RNA_RNA[RNA_RNA['Category2'] == 'circRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
circRNA_RNA = pd.concat([circRNA_RNA, RNA_circRNA])
# Displayed to see which partner categories are present.
circRNA_RNA.Category2.unique()
# circRNA-mRNA contains no NCBI IDs for circRNA

* ##### scRNA-RNA

In [None]:
# Gather every interaction involving a scRNA, oriented so that the scRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
scRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'scRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_scRNA = RNA_RNA[RNA_RNA['Category2'] == 'scRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
scRNA_RNA = pd.concat([scRNA_RNA, RNA_scRNA])
# Displayed to see which partner categories are present.
scRNA_RNA.Category2.unique()

In [None]:
# scRNA-mRNA pairs; unlike the sibling cells, no missing-ID or leading-digit
# filter is applied here before writing.
scRNA_mRNA = scRNA_RNA.loc[scRNA_RNA['Category2'] == 'mRNA']
(
    scRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'scRNA-mRNA.txt', header=None, sep='\t', index=None)
)

* ##### unknownRNA-RNA

In [None]:
# Gather every interaction involving an RNA of category 'unknown', oriented so
# that it is always interactor 1: rows where it is interactor 2 have their
# symbol, category and raw-ID columns swapped before being stacked underneath.
unknownRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'unknown']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_unknownRNA = RNA_RNA[RNA_RNA['Category2'] == 'unknown'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
unknownRNA_RNA = pd.concat([unknownRNA_RNA, RNA_unknownRNA])
# Displayed to see which partner categories are present.
unknownRNA_RNA.Category2.unique()

* ##### TF-RNA

In [None]:
# Gather every interaction involving a TF, oriented so that the TF is always
# interactor 1: rows where it is interactor 2 have their symbol, category and
# raw-ID columns swapped before being stacked underneath.
TF_RNA = RNA_RNA[RNA_RNA['Category1'] == 'TF']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_TF = RNA_RNA[RNA_RNA['Category2'] == 'TF'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# Stack the Category1 slice with the swapped slice. The previous code appended
# RNA_TF to itself, which dropped the Category1 rows and doubled the swapped ones.
# (pd.concat also replaces DataFrame.append, removed in pandas 2.0.)
TF_RNA = pd.concat([TF_RNA, RNA_TF])
# Displayed to see which partner categories are present.
TF_RNA.Category2.unique()

* ##### pseudogene-RNA

In [None]:
# Gather every interaction involving a pseudogene, oriented so that the
# pseudogene is always interactor 1: rows where it is interactor 2 have their
# symbol, category and raw-ID columns swapped before being stacked underneath.
pseudo_RNA = RNA_RNA[RNA_RNA['Category1'] == 'pseudo']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_pseudo = RNA_RNA[RNA_RNA['Category2'] == 'pseudo'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# Stack the Category1 slice with the swapped slice. The previous code appended
# RNA_pseudo to itself, which dropped the Category1 rows and doubled the swapped ones.
# (pd.concat also replaces DataFrame.append, removed in pandas 2.0.)
pseudo_RNA = pd.concat([pseudo_RNA, RNA_pseudo])
# Displayed to see which partner categories are present.
pseudo_RNA.Category2.unique()

In [None]:
# pseudogene-mRNA pairs: both raw IDs must be present and begin with a digit.
pseudo_mRNA = (
    pseudo_RNA
    .loc[pseudo_RNA['Category2'] == 'mRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    pseudo_mRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'pseudogene-mRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# pseudogene-pseudogene pairs: both raw IDs must be present and begin with a digit.
pseudo_pseudo = (
    pseudo_RNA
    .loc[pseudo_RNA['Category2'] == 'pseudo']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    pseudo_pseudo[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'pseudogene-pseudogene.txt', header=None, sep='\t', index=None)
)

In [None]:
# pseudogene-rRNA pairs: both raw IDs must be present and begin with a digit.
pseudo_rRNA = (
    pseudo_RNA
    .loc[pseudo_RNA['Category2'] == 'rRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    pseudo_rRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'pseudogene-rRNA.txt', header=None, sep='\t', index=None)
)

* ##### ribozyme-RNA

In [None]:
# Gather every interaction involving a ribozyme, oriented so that the ribozyme
# is always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
ribozyme_RNA = RNA_RNA[RNA_RNA['Category1'] == 'ribozyme']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_ribozyme = RNA_RNA[RNA_RNA['Category2'] == 'ribozyme'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# Stack the Category1 slice with the swapped slice. The previous code appended
# RNA_ribozyme to itself, which dropped the Category1 rows and doubled the swapped ones.
# (pd.concat also replaces DataFrame.append, removed in pandas 2.0.)
ribozyme_RNA = pd.concat([ribozyme_RNA, RNA_ribozyme])
# Displayed to see which partner categories are present.
ribozyme_RNA.Category2.unique()

* ##### mRNA-RNA

In [None]:
# Gather every interaction involving an mRNA, oriented so that the mRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
mRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'mRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_mRNA = RNA_RNA[RNA_RNA['Category2'] == 'mRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
mRNA_RNA = pd.concat([mRNA_RNA, RNA_mRNA])
# Displayed to see which partner categories are present.
mRNA_RNA.Category2.unique()

In [None]:
# mRNA-mRNA pairs: both raw IDs must be present and begin with a digit.
mRNA_mRNA = (
    mRNA_RNA
    .loc[mRNA_RNA['Category2'] == 'mRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    mRNA_mRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'mRNA-mRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# mRNA-rRNA pairs: both raw IDs must be present and begin with a digit.
mRNA_rRNA = (
    mRNA_RNA
    .loc[mRNA_RNA['Category2'] == 'rRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    mRNA_rRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'mRNA-rRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# mRNA-ncRNA pairs: both raw IDs must be present and begin with a digit.
mRNA_ncRNA = (
    mRNA_RNA
    .loc[mRNA_RNA['Category2'] == 'ncRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    mRNA_ncRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'mRNA-ncRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# mRNA-scaRNA pairs: both raw IDs must be present and begin with a digit.
mRNA_scaRNA = (
    mRNA_RNA
    .loc[mRNA_RNA['Category2'] == 'scaRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    mRNA_scaRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'mRNA-scaRNA.txt', header=None, sep='\t', index=None)
)

* ##### ncRNA-RNA

In [None]:
# Gather every interaction involving an ncRNA, oriented so that the ncRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
ncRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'ncRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_ncRNA = RNA_RNA[RNA_RNA['Category2'] == 'ncRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
ncRNA_RNA = pd.concat([ncRNA_RNA, RNA_ncRNA])
# Displayed to see which partner categories are present.
ncRNA_RNA.Category2.unique()

* ##### rRNA-RNA

In [None]:
# Gather every interaction involving an rRNA, oriented so that the rRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
rRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'rRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_rRNA = RNA_RNA[RNA_RNA['Category2'] == 'rRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
rRNA_RNA = pd.concat([rRNA_RNA, RNA_rRNA])
# Displayed to see which partner categories are present.
rRNA_RNA.Category2.unique()

In [None]:
# rRNA-rRNA pairs: both raw IDs must be present and begin with a digit.
rRNA_rRNA = (
    rRNA_RNA
    .loc[rRNA_RNA['Category2'] == 'rRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)

# Unique ID pairs go to a headerless, index-free TSV.
(
    rRNA_rRNA[['Raw_ID1', 'Raw_ID2']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'rRNA-rRNA.txt', header=None, sep='\t', index=None)
)

In [None]:
# rRNA-scaRNA pairs: both raw IDs must be present and begin with a digit.
# Nothing is written for this pair type (the author notes the result is empty).
rRNA_scaRNA = (
    rRNA_RNA
    .loc[rRNA_RNA['Category2'] == 'scaRNA']
    .dropna(subset=['Raw_ID1', 'Raw_ID2'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()
         & df['Raw_ID2'].str[0].str.isdigit()]
)
# Empty

* ##### protein-RNA

In [None]:
# Gather every interaction involving a 'protein' entry, oriented so that the
# protein is always interactor 1: rows where it is interactor 2 have their
# symbol, category and raw-ID columns swapped before being stacked underneath.
protein_RNA = RNA_RNA[RNA_RNA['Category1'] == 'protein']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_protein = RNA_RNA[RNA_RNA['Category2'] == 'protein'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
protein_RNA = pd.concat([protein_RNA, RNA_protein])
# Displayed to see which partner categories are present.
protein_RNA.Category2.unique()

* ##### scaRNA-RNA

In [None]:
# Gather every interaction involving a scaRNA, oriented so that the scaRNA is
# always interactor 1: rows where it is interactor 2 have their symbol,
# category and raw-ID columns swapped before being stacked underneath.
scaRNA_RNA = RNA_RNA[RNA_RNA['Category1'] == 'scaRNA']
# rename() returns a new frame, so the RNA_RNA slice is never modified in place.
RNA_scaRNA = RNA_RNA[RNA_RNA['Category2'] == 'scaRNA'].rename(columns={
    'Interactor1.Symbol': 'Interactor2.Symbol', 'Category1': 'Category2', 'Raw_ID1': 'Raw_ID2',
    'Interactor2.Symbol': 'Interactor1.Symbol', 'Category2': 'Category1', 'Raw_ID2': 'Raw_ID1'})
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
scaRNA_RNA = pd.concat([scaRNA_RNA, RNA_scaRNA])
# Displayed to see which partner categories are present.
scaRNA_RNA.Category2.unique()

* #### RNA-protein

In [None]:
# Load the RNAInter RNA-protein table and normalise it for the per-category
# export cells that follow.
# http://www.rnainter.org/raidMedia/download/Download_data_RP.tar.gz
RNA_protein = pd.read_csv(unprocessed_data_location+'Download_data_RP.txt',sep='\t')
# We select only strong evidence interactions for hsa
# (score >= 0.2886 and both species strings containing 'apiens')
RNA_protein = RNA_protein[(RNA_protein['score'] >= 0.2886) &
                  (RNA_protein['Species1'].str.contains('apiens')) &
                  (RNA_protein['Species2'].str.contains('apiens'))]

# We keep only rows whose RNA ID starts with NCBI, miRBase, nm-tRNA or tRNA
# and whose protein ID starts with NCBI
RNA_protein = RNA_protein[((RNA_protein['Raw_ID1'].str.startswith('NCBI')) |
                  (RNA_protein['Raw_ID1'].str.startswith('miRBase')) |
                  (RNA_protein['Raw_ID1'].str.startswith('nm-tRNA')) |
                  (RNA_protein['Raw_ID1'].str.startswith('tRNA'))) &
                  (RNA_protein['Raw_ID2'].str.startswith('NCBI'))
                ]

# Strip the database prefixes so only the bare identifiers remain
RNA_protein.Raw_ID1 = RNA_protein.Raw_ID1.str.replace("NCBI:", '')
RNA_protein.Raw_ID2 = RNA_protein.Raw_ID2.str.replace("NCBI:", '')

# miRNA
RNA_protein.Raw_ID1 = RNA_protein.Raw_ID1.str.replace("miRBase:", '')
RNA_protein.Raw_ID2 = RNA_protein.Raw_ID2.str.replace("miRBase:", '')

# A field may hold several ';'-separated IDs: give each ID its own row
RNA_protein['Raw_ID1'] = RNA_protein['Raw_ID1'].str.split(';')
RNA_protein = RNA_protein.explode('Raw_ID1')
RNA_protein['Raw_ID2'] = RNA_protein['Raw_ID2'].str.split(';')
RNA_protein = RNA_protein.explode('Raw_ID2')

# Replace the protein-side raw ID by column 1 of entrez_pro_map (joined on the
# integer-cast ID against the map's column 0); unmatched rows are dropped by
# the inner join.
RNA_protein = RNA_protein[(RNA_protein['Raw_ID2'].notna())]
RNA_protein['Raw_ID2'] = RNA_protein['Raw_ID2'].astype(str).astype(int)
RNA_protein = pd.merge(RNA_protein, entrez_pro_map.rename(columns={0: 'Raw_ID2'}), on='Raw_ID2')
RNA_protein.drop(columns=['Raw_ID2'], inplace=True)

# For every non-miRNA row, tag the RNA ID with its category: '<id>#<Category1>'.
# The index is unique here because merge() returned a fresh one.
i = RNA_protein[~(RNA_protein['Category1']=='miRNA')].index.values
RNA_protein.loc[i,"Raw_ID1"] = RNA_protein.loc[i,"Raw_ID1"] + '#' + RNA_protein.loc[i,"Category1"]

RNA_protein.drop(columns=['RNAInterID','Species1','Species2'],inplace=True)
# Displayed for inspection
RNA_protein

In [None]:
# Display the distinct partner categories found in the RNA-protein table.
set(RNA_protein['Category2'])

In [None]:
# Split the RNA-protein table by partner category. RBP and TF rows get their
# own frames; RNA_protein itself is then narrowed to generic protein rows
# (the source spells the category both 'protein' and 'Protein').
RNA_RBP = RNA_protein.loc[RNA_protein['Category2'] == 'RBP']
RNA_TF = RNA_protein.loc[RNA_protein['Category2'] == 'TF']
RNA_protein = RNA_protein.loc[RNA_protein['Category2'].isin(['protein', 'Protein'])]

In [None]:
# Display the distinct RNA categories that interact with generic proteins.
set(RNA_protein['Category1'])

In [None]:
# circRNA-protein pairs: the RNA ID must be present and begin with a digit.
circRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'circRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    circRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'circRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# lncRNA-protein pairs from the RNA-protein table: the RNA ID must be present
# and begin with a digit.
lncRNA_protein2 = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'lncRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Add them to the lncRNA-protein pairs collected earlier from the RNA-RNA table.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
# Note: re-running this cell stacks the same rows again (harmless for the
# export because duplicates are dropped before writing).
lncRNA_protein = pd.concat([lncRNA_protein, lncRNA_protein2])

# Overwrites the file written earlier with the combined set of unique pairs.
lncRNA_protein[['Raw_ID1', 1]].drop_duplicates().to_csv(
    edge_data_location + 'lncRNA-protein.txt', header=None, sep='\t', index=None)

In [None]:
# mRNA-protein pairs: the RNA ID must be present and begin with a digit.
mRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'mRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    mRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'mRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# ncRNA-protein pairs: the RNA ID must be present and begin with a digit.
ncRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'ncRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    ncRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'ncRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# 'others'-category RNA-protein pairs: the RNA ID must be present and begin
# with a digit. The previous filter selected 'ncRNA' (copy-paste slip), which
# made this file a duplicate of 'ncRNA-protein.txt'.
# NOTE(review): 'others' is inferred from the output file name -- confirm the
# exact label against set(RNA_protein.Category1).
othersRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'others']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
othersRNA_protein[['Raw_ID1', 1]].drop_duplicates().to_csv(
    edge_data_location + 'othersRNA-protein.txt', header=None, sep='\t', index=None)

In [None]:
# pseudogene-protein pairs: the RNA ID must be present and begin with a digit.
pseudo_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'pseudo']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    pseudo_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'pseudogene-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# ribozyme-protein pairs: the RNA ID must be present and begin with a digit.
ribozyme_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'ribozyme']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    ribozyme_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'ribozyme-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# scRNA-protein pairs: the RNA ID must be present and begin with a digit.
scRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'scRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    scRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'scRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# snRNA-protein pairs: the RNA ID must be present and begin with a digit.
snRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'snRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    snRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'snRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# snoRNA-protein pairs: the RNA ID must be present and begin with a digit.
snoRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'snoRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    snoRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'snoRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# 'unknown'-category RNA-protein pairs: the RNA ID must be present and begin
# with a digit.
unknownRNA_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'unknown']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    unknownRNA_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'unknownRNA-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# vtRNA-protein pairs: the RNA ID must be present and begin with a digit.
vtRNAs_protein = (
    RNA_protein
    .loc[RNA_protein['Category1'] == 'vtRNAs']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    vtRNAs_protein[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'vtRNAs-protein.txt', header=None, sep='\t', index=None)
)

In [None]:
# miRNA-protein pairs from the RNA-protein table, split by miRBase accession
# type: 'MIMAT...' = mature miRNA, other 'MI...' = precursor (pre-miRNA).
# The leading-digit filter used for the other RNA categories is deliberately
# NOT applied here: it would discard every 'MI...' accession and leave both
# frames below permanently empty.
miRNA_protein2 = RNA_protein[RNA_protein['Category1'] == 'miRNA']
miRNA_protein2 = miRNA_protein2[miRNA_protein2['Raw_ID1'].notna()]

maturemiRNA_protein2 = miRNA_protein2[miRNA_protein2['Raw_ID1'].str.startswith('MIMAT')]
premiRNA_protein2 = miRNA_protein2[(miRNA_protein2['Raw_ID1'].str.startswith('MI')) &
                                   (~miRNA_protein2['Raw_ID1'].str.startswith('MIMAT'))]

# Add them to the miRNA-protein frames built in earlier cells.
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
maturemiRNA_protein = pd.concat([maturemiRNA_protein, maturemiRNA_protein2])
premiRNA_protein = pd.concat([premiRNA_protein, premiRNA_protein2])

# Column 1 holds the protein ID taken from entrez_pro_map.
maturemiRNA_protein[['Raw_ID1', 1]].drop_duplicates().to_csv(
    edge_data_location + 'miRNA-protein.txt', header=None, sep='\t', index=None)
premiRNA_protein[['Raw_ID1', 1]].drop_duplicates().to_csv(
    edge_data_location + 'premiRNA-protein.txt', header=None, sep='\t', index=None)

***

In [None]:
# Display the distinct RNA categories that interact with RBPs.
set(RNA_RBP['Category1'])

In [None]:
# Write one '<category>-RBP.txt' edge file per RNA category except miRNA
# (handled in its own cell). Categories with no usable rows produce no file.
for rna_type in set(RNA_RBP['Category1']):
    if rna_type == 'miRNA':
        continue

    # The RNA ID must be present and begin with a digit.
    RNA_RBP_ = (
        RNA_RBP
        .loc[RNA_RBP['Category1'] == rna_type]
        .dropna(subset=['Raw_ID1'])
        .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
    )
    if RNA_RBP_.empty:
        continue

    # Column 1 holds the protein ID taken from entrez_pro_map.
    RNA_RBP_[['Raw_ID1', 1]].drop_duplicates().to_csv(
        edge_data_location + rna_type + '-RBP.txt', header=None, sep='\t', index=None)

In [None]:
# miRNA-RBP rows whose RNA ID is present and begins with a digit (i.e. not a
# miRBase 'MI...' accession).
miRNA_RBP = (
    RNA_RBP
    .loc[RNA_RBP['Category1'] == 'miRNA']
    .dropna(subset=['Raw_ID1'])
    .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
)

# Rewrite the symbol as 'hsa-' + lower-case symbol with a dash inserted after
# 'mir' / 'let', so it can be joined against column 0 of mirna_mirbase_map.
miRNA_RBP['Interactor1.Symbol'] = 'hsa-' + (
    miRNA_RBP['Interactor1.Symbol']
    .str.lower()
    .str.replace('mir','mir-')
    .str.replace('let','let-')
)

miRNA_RBP = miRNA_RBP.merge(
    mirna_mirbase_map.rename(columns={0:'Interactor1.Symbol'}), on='Interactor1.Symbol')

# Both inputs carry a column named 1, so the merge suffixes them:
# '1_y' comes from mirna_mirbase_map, '1_x' is the protein ID already present.
(
    miRNA_RBP[['1_y','1_x']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'premiRNA-RBP.txt', header=None, sep='\t', index=None)
)

***

In [None]:
# Display the distinct RNA categories that interact with TFs.
set(RNA_TF['Category1'])

In [None]:
# Write one '<category>-TF.txt' edge file per RNA category except miRNA
# (handled in its own cell). Categories with no usable rows produce no file.
for rna_type in set(RNA_TF.Category1):
    if rna_type == 'miRNA':
        continue

    # The RNA ID must be present and begin with a digit.
    RNA_TF_ = RNA_TF[RNA_TF['Category1'] == rna_type]
    RNA_TF_ = RNA_TF_[RNA_TF_['Raw_ID1'].notna()]
    RNA_TF_ = RNA_TF_[RNA_TF_['Raw_ID1'].str[0].str.isdigit()]

    if rna_type == 'lncRNA':
        # Merge in the lncRNA-TF pairs collected earlier from the RNA-RNA table.
        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
        RNA_TF_ = pd.concat([lncRNA_TF, RNA_TF_])

    if not RNA_TF_.empty:
        # Column 1 holds the protein ID taken from entrez_pro_map.
        RNA_TF_[['Raw_ID1', 1]].drop_duplicates().to_csv(
            edge_data_location + rna_type + '-TF.txt', header=None, sep='\t', index=None)

In [None]:
# miRNA-TF pairs, split by miRBase accession type:
# 'MIMAT...' = mature miRNA, any other 'MI...' = precursor (pre-miRNA).
miRNA_TF = RNA_TF.loc[RNA_TF['Category1'] == 'miRNA'].dropna(subset=['Raw_ID1'])

maturemiRNA_TF = miRNA_TF.loc[miRNA_TF['Raw_ID1'].str.startswith('MIMAT')]
premiRNA_TF = miRNA_TF.loc[
    (miRNA_TF['Raw_ID1'].str.startswith('MI'))
    & (~miRNA_TF['Raw_ID1'].str.startswith('MIMAT'))
]

# Column 1 holds the protein ID taken from entrez_pro_map.
(
    maturemiRNA_TF[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'miRNA-TF.txt', header=None, sep='\t', index=None)
)
(
    premiRNA_TF[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'premiRNA-TF.txt', header=None, sep='\t', index=None)
)

* #### RNA-gene

In [None]:
# Load the RNAInter RNA-DNA (gene) table and normalise it for the per-category
# export loop that follows.
# http://www.rnainter.org/raidMedia/download/Download_data_RD.tar.gz
RNA_gene = pd.read_csv(unprocessed_data_location+'Download_data_RD.txt',sep='\t')
# We select only strong evidence interactions for hsa
# (score >= 0.2886 and both species strings containing 'apiens')
RNA_gene = RNA_gene[(RNA_gene['score'] >= 0.2886) &
                  (RNA_gene['Species1'].str.contains('apiens')) &
                  (RNA_gene['Species2'].str.contains('apiens'))]

# We keep only entries starting with NCBI (on both sides)
RNA_gene = RNA_gene[(RNA_gene['Raw_ID1'].str.startswith('NCBI')) &
                    (RNA_gene['Raw_ID2'].str.startswith('NCBI'))]

# Strip the database prefix so only the bare identifiers remain
RNA_gene.Raw_ID1 = RNA_gene.Raw_ID1.str.replace("NCBI:", '')
RNA_gene.Raw_ID2 = RNA_gene.Raw_ID2.str.replace("NCBI:", '')

# A field may hold several ';'-separated IDs: give each ID its own row
RNA_gene['Raw_ID1'] = RNA_gene['Raw_ID1'].str.split(';')
RNA_gene = RNA_gene.explode('Raw_ID1')
RNA_gene['Raw_ID2'] = RNA_gene['Raw_ID2'].str.split(';')
RNA_gene = RNA_gene.explode('Raw_ID2')

# Tag every RNA ID with its category: '<id>#<Category1>'
RNA_gene["Raw_ID1"] = RNA_gene["Raw_ID1"] + '#' + RNA_gene["Category1"]

RNA_gene.drop(columns=['Interactor1.Symbol','Interactor2.Symbol',
                       'RNAInterID','Species1','Species2'],inplace=True)
# Displayed for inspection
RNA_gene

In [None]:
# Write one '<category>-gene.txt' edge file per RNA category. Categories with
# no usable rows produce no file.
for rna_type in set(RNA_gene['Category1']):
    # Each ID must be present and begin with a digit (checked RNA side first,
    # then gene side).
    RNA_gene_ = (
        RNA_gene
        .loc[RNA_gene['Category1'] == rna_type]
        .dropna(subset=['Raw_ID1'])
        .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
        .dropna(subset=['Raw_ID2'])
        .loc[lambda df: df['Raw_ID2'].str[0].str.isdigit()]
    )
    if RNA_gene_.empty:
        continue

    RNA_gene_[['Raw_ID1', 'Raw_ID2']].drop_duplicates().to_csv(
        edge_data_location + rna_type + '-gene.txt', header=None, sep='\t', index=None)

* #### RNA-chemical

In [None]:
# Load the RNAInter RNA-compound table and normalise it for the per-category
# export cells that follow.
# http://www.rnainter.org/raidMedia/download/Download_data_RC.tar.gz
RNA_chemical = pd.read_csv(unprocessed_data_location+'Download_data_RC.txt',sep='\t')
# We select only strong evidence interactions for hsa
# (score >= 0.2886 and the RNA species string containing 'apiens')
RNA_chemical = RNA_chemical[(RNA_chemical['score'] >= 0.2886) &
                  (RNA_chemical['Species1'].str.contains('apiens'))]

# We keep only RNA entries starting with NCBI or miRBase
RNA_chemical = RNA_chemical[(RNA_chemical['Raw_ID1'].str.startswith('NCBI')) |
                            (RNA_chemical['Raw_ID1'].str.startswith('miRBase'))]

# Strip the database prefixes so only the bare identifiers remain
RNA_chemical.Raw_ID1 = RNA_chemical.Raw_ID1.str.replace("NCBI:", '')
RNA_chemical.Raw_ID1 = RNA_chemical.Raw_ID1.str.replace("miRBase:", '')

# A field may hold several ';'-separated IDs: give each ID its own row.
# explode() repeats the index label of the row it splits, so the index is
# rebuilt to keep every row uniquely addressable.
RNA_chemical['Raw_ID1'] = RNA_chemical['Raw_ID1'].str.split(';')
RNA_chemical = RNA_chemical.explode('Raw_ID1').reset_index(drop=True)

# For every non-miRNA row, tag the RNA ID with its category: '<id>#<Category1>'.
# A boolean mask is used instead of index labels so the selection is
# positional and cannot fan out over repeated labels.
not_mirna = RNA_chemical['Category1'] != 'miRNA'
RNA_chemical.loc[not_mirna, 'Raw_ID1'] = (
    RNA_chemical.loc[not_mirna, 'Raw_ID1'] + '#' + RNA_chemical.loc[not_mirna, 'Category1'])

# Map the lower-cased compound name to column 1 of desc_chebi_map (joined on
# the map's column 0); compounds without a match are dropped by the inner join.
RNA_chemical = RNA_chemical[(RNA_chemical['Interactor2.Symbol'].notna())]
RNA_chemical['Interactor2.Symbol'] = RNA_chemical['Interactor2.Symbol'].str.lower()
RNA_chemical = pd.merge(RNA_chemical, desc_chebi_map.rename(columns={0: 'Interactor2.Symbol'}),
                        on='Interactor2.Symbol')

RNA_chemical.drop(columns=['Interactor1.Symbol','Raw_ID2','Interactor2.Symbol',
                           'RNAInterID','Species1','Species2','Category2'],inplace=True)
# Displayed for inspection
RNA_chemical

In [None]:
# Write one '<category>-chemical.txt' edge file per RNA category except miRNA
# (handled in its own cell). Categories with no usable rows produce no file.
for rna_type in set(RNA_chemical['Category1']):
    if rna_type == 'miRNA':
        continue

    # The RNA ID must be present and begin with a digit.
    RNA_chemical_ = (
        RNA_chemical
        .loc[RNA_chemical['Category1'] == rna_type]
        .dropna(subset=['Raw_ID1'])
        .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
    )
    if RNA_chemical_.empty:
        continue

    # Column 1 holds the compound identifier taken from desc_chebi_map.
    RNA_chemical_[['Raw_ID1', 1]].drop_duplicates().to_csv(
        edge_data_location + rna_type + '-chemical.txt', header=None, sep='\t', index=None)

In [None]:
# miRNA-chemical pairs, split by miRBase accession type:
# 'MIMAT...' = mature miRNA, any other 'MI...' = precursor (pre-miRNA).
miRNA_chemical = (
    RNA_chemical
    .loc[RNA_chemical['Category1'] == 'miRNA']
    .dropna(subset=['Raw_ID1'])
)

maturemiRNA_chemical = miRNA_chemical.loc[miRNA_chemical['Raw_ID1'].str.startswith('MIMAT')]
premiRNA_chemical = miRNA_chemical.loc[
    (miRNA_chemical['Raw_ID1'].str.startswith('MI'))
    & (~miRNA_chemical['Raw_ID1'].str.startswith('MIMAT'))
]

# Column 1 holds the compound identifier taken from desc_chebi_map.
(
    maturemiRNA_chemical[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'miRNA-chemical.txt', header=None, sep='\t', index=None)
)
(
    premiRNA_chemical[['Raw_ID1', 1]]
    .drop_duplicates()
    .to_csv(edge_data_location + 'premiRNA-chemical.txt', header=None, sep='\t', index=None)
)

* #### RNA-epigenetic modification

In [None]:
# Load the RNAInter RNA-histone modification table and normalise it for the
# per-category export cell that follows.
# http://www.rnainter.org/raidMedia/download/Download_data_RH.tar.gz
RNA_hisMod = pd.read_csv(unprocessed_data_location+'Download_data_RH.txt',sep='\t')
# We select only strong evidence interactions for hsa
# (score >= 0.2886 and the RNA species string containing 'apiens')
RNA_hisMod = RNA_hisMod[(RNA_hisMod['score'] >= 0.2886) &
                  (RNA_hisMod['Species1'].str.contains('apiens'))]

# We keep only RNA entries starting with NCBI or miRBase
RNA_hisMod = RNA_hisMod[(RNA_hisMod['Raw_ID1'].str.startswith('NCBI')) |
                        (RNA_hisMod['Raw_ID1'].str.startswith('miRBase'))]

# Strip the database prefixes so only the bare identifiers remain
RNA_hisMod.Raw_ID1 = RNA_hisMod.Raw_ID1.str.replace("NCBI:", '')
RNA_hisMod.Raw_ID1 = RNA_hisMod.Raw_ID1.str.replace("miRBase:", '')

# A field may hold several ';'-separated IDs: give each ID its own row.
# explode() repeats the index label of the row it splits, so the index is
# rebuilt to keep every row uniquely addressable.
RNA_hisMod['Raw_ID1'] = RNA_hisMod['Raw_ID1'].str.split(';')
RNA_hisMod = RNA_hisMod.explode('Raw_ID1').reset_index(drop=True)

# For every non-miRNA row, tag the RNA ID with its category: '<id>#<Category1>'.
# A boolean mask is used instead of index labels so the selection is
# positional and cannot fan out over repeated labels.
not_mirna = RNA_hisMod['Category1'] != 'miRNA'
RNA_hisMod.loc[not_mirna, 'Raw_ID1'] = (
    RNA_hisMod.loc[not_mirna, 'Raw_ID1'] + '#' + RNA_hisMod.loc[not_mirna, 'Category1'])

RNA_hisMod.drop(columns=['Interactor1.Symbol','Raw_ID2','RNAInterID',
                           'Species1','Species2','Category2'],inplace=True)
# Displayed for inspection
RNA_hisMod

In [None]:
# Write one '<category>-histoneModification.txt' edge file per RNA category
# except miRNA. Categories with no usable rows produce no file.
for rna_type in set(RNA_hisMod['Category1']):
    if rna_type == 'miRNA':
        continue

    # The RNA ID must be present and begin with a digit.
    RNA_hisMod_ = (
        RNA_hisMod
        .loc[RNA_hisMod['Category1'] == rna_type]
        .dropna(subset=['Raw_ID1'])
        .loc[lambda df: df['Raw_ID1'].str[0].str.isdigit()]
    )
    if RNA_hisMod_.empty:
        continue

    # The modification is identified by its symbol (Interactor2.Symbol).
    RNA_hisMod_[['Raw_ID1', 'Interactor2.Symbol']].drop_duplicates().to_csv(
        edge_data_location + rna_type + '-histoneModification.txt', header=None, sep='\t', index=None)

# miRNA rows: only precursor accessions ('MI...' but not 'MIMAT...') are
# exported; no mature-miRNA file is written for this interaction type.
miRNA_hisMod = RNA_hisMod.loc[RNA_hisMod['Category1'] == 'miRNA'].dropna(subset=['Raw_ID1'])

premiRNA_hisMod = miRNA_hisMod.loc[
    (miRNA_hisMod['Raw_ID1'].str.startswith('MI'))
    & (~miRNA_hisMod['Raw_ID1'].str.startswith('MIMAT'))
]

(
    premiRNA_hisMod[['Raw_ID1', 'Interactor2.Symbol']]
    .drop_duplicates()
    .to_csv(edge_data_location + 'premiRNA-histoneModification.txt', header=None, sep='\t', index=None)
)

***
### [RNALocate](http://www.rnalocate.org/)
RNALocate aims to provide a resource for efficient manipulation, browsing and analysis of RNA subcellular localization.

In [None]:
#http://www.rnalocate.org/download/All%20RNA%20subcellular%20localization%20data.zip
RNA_location = pd.read_csv(unprocessed_data_location+'All RNA subcellular localization data.txt',sep='\t')
# Keep only the rows whose species name contains 'apiens' (Homo sapiens)
RNA_location = RNA_location.loc[RNA_location['Species'].str.contains('apiens')]

RNA_location['RNA_category'] = RNA_location['RNA_category'].str.replace("other", "others")

# Keep only identifiers that come from NCBI or from miRBase ...
source_ids = RNA_location['Gene_ID']
RNA_location = RNA_location.loc[source_ids.str.startswith('NCBI') |
                                source_ids.str.startswith('miRBase')]

# ... and strip the source prefix from them
RNA_location['Gene_ID'] = RNA_location['Gene_ID'].str.replace("NCBI:", '')
RNA_location['Gene_ID'] = RNA_location['Gene_ID'].str.replace("miRBase:", '')

# One row per ';'-separated identifier
RNA_location['Gene_ID'] = RNA_location['Gene_ID'].str.split(';')
RNA_location = RNA_location.explode('Gene_ID')

# Inner-join with desc_go_map on the lower-cased localization description;
# rows without a description, or without a match in the map, are dropped
RNA_location = RNA_location.loc[RNA_location['SubCellular_Localization'].notna()]
RNA_location['SubCellular_Localization'] = RNA_location['SubCellular_Localization'].str.lower()
localization_lookup = desc_go_map.rename(columns={0: 'SubCellular_Localization'})
RNA_location = pd.merge(RNA_location, localization_lookup, on='SubCellular_Localization')

# Non-miRNA identifiers are suffixed with their category ('<id>#<category>')
RNA_location['RNA_category'] = RNA_location['RNA_category'].str.replace('Y RNA', 'Y_RNA')
not_mirna = RNA_location['RNA_category'] != 'miRNA'
RNA_location.loc[not_mirna, "Gene_ID"] = (RNA_location.loc[not_mirna, "Gene_ID"] + '#'
                                          + RNA_location.loc[not_mirna, "RNA_category"])

RNA_location = RNA_location.drop(columns=['RNALocate_ID','Gene_Name','Gene_symbol',
                                          'Species','SubCellular_Localization'])

RNA_location

In [None]:
# miRNA rows that carry an identifier
miRNA_GO = RNA_location[RNA_location['RNA_category'] == 'miRNA']
miRNA_GO = miRNA_GO[miRNA_GO['Gene_ID'].notna()]

# 'MIMAT...' accessions go to the mature set; other 'MI...' accessions go to
# the precursor set
starts_with_mimat = miRNA_GO['Gene_ID'].str.startswith('MIMAT')
starts_with_mi = miRNA_GO['Gene_ID'].str.startswith('MI')
maturemiRNA_GO = miRNA_GO[starts_with_mimat]
premiRNA_GO = miRNA_GO[starts_with_mi & (~starts_with_mimat)]

# Write the de-duplicated (identifier, column 1) pairs, one file per set
for go_frame, go_file in ((maturemiRNA_GO, 'miRNA-GO.txt'),
                          (premiRNA_GO, 'premiRNA-GO.txt')):
    go_frame[['Gene_ID', 1]].drop_duplicates().to_csv(
        edge_data_location + go_file, header=None, sep='\t', index=None)

In [None]:
# One '<category>-GO.txt' edge file per non-miRNA category, restricted to rows
# whose identifier is present and starts with a digit
for rna_type in set(RNA_location.RNA_category):
    if rna_type == 'miRNA':
        continue
    typed_location = RNA_location[RNA_location['RNA_category'] == rna_type]
    typed_location = typed_location[typed_location['Gene_ID'].notna()]
    typed_location = typed_location[typed_location['Gene_ID'].str[0].str.isdigit()]

    if typed_location.empty:
        continue
    typed_edges = typed_location[['Gene_ID', 1]].drop_duplicates()
    typed_edges.to_csv(edge_data_location + rna_type + '-GO.txt',
                       header=None, sep='\t', index=None)

***
### [RNADisease](http://www.rnadisease.org/)
RNADisease includes literature-verified RNA-disease interaction entries and uses a variety of algorithms to obtain a large amount of predicted RNA-disease data.

In [None]:
#http://www.rnadisease.org/static/download/RNADiseasev4.0_RNA-disease_experiment_all.zip
RNA_disease = pd.read_excel(unprocessed_data_location+'RNADiseasev4.0_RNA-disease_experiment_all.xlsx')

# Successive row filters:
#   - species name contains 'apiens' (the column is spelled 'specise' here),
#   - score of at least 0.95 (see http://www.rnadisease.org/help Q10),
#   - a 'DO ID' value is present
RNA_disease = RNA_disease.loc[RNA_disease['specise'].str.contains('apiens')]
RNA_disease = RNA_disease.loc[RNA_disease['score'] >= 0.95]
RNA_disease = RNA_disease.loc[RNA_disease['DO ID'].notna()]

# Replace ':' with '_' in the DO identifiers, then inner-join with doid_mondo_map
RNA_disease['DO ID'] = RNA_disease['DO ID'].str.replace(':','_')
doid_lookup = doid_mondo_map.rename(columns={0:'DO ID'})
RNA_disease = pd.merge(RNA_disease, doid_lookup, on=['DO ID'])

# Column 1 (brought in by the merge) is split on ',' into one row per value
RNA_disease[1] = RNA_disease[1].str.split(',')
RNA_disease = RNA_disease.explode(1)

RNA_disease = RNA_disease.drop(columns=['RDID','specise','Disease Name',
                                        'MeSH ID','KEGG disease ID','DO ID'])
RNA_disease

In [None]:
# miRNA rows with a symbol, joined with mirna_mirbase_map on that symbol.
# Both frames own a column labelled 1, so the merge yields '1_x' (from
# RNA_disease) and '1_y' (from the map).
miRNA_disease = RNA_disease[RNA_disease['RNA Type'] == 'miRNA']
miRNA_disease = miRNA_disease[miRNA_disease['RNA Symbol'].notna()]

symbol_lookup = mirna_mirbase_map.rename(columns={0: 'RNA Symbol'})
miRNA_disease = pd.merge(miRNA_disease, symbol_lookup, on='RNA Symbol')

# 'MIMAT...' accessions -> mature set, other 'MI...' accessions -> precursor set
starts_with_mimat = miRNA_disease['1_y'].str.startswith('MIMAT')
starts_with_mi = miRNA_disease['1_y'].str.startswith('MI')
maturemiRNA_disease = miRNA_disease[starts_with_mimat]
premiRNA_disease = miRNA_disease[starts_with_mi & (~starts_with_mimat)]

for disease_frame, disease_file in ((maturemiRNA_disease, 'miRNA-disease.txt'),
                                    (premiRNA_disease, 'premiRNA-disease.txt')):
    disease_frame[['1_y', '1_x']].drop_duplicates().to_csv(
        edge_data_location + disease_file, header=None, sep='\t', index=None)

In [None]:
# Inner-join with symbol_entrez_map on the RNA symbol; the map's column 1
# becomes '1_y' because RNA_disease already owns a column labelled 1
symbol_lookup = symbol_entrez_map.rename(columns={0: 'RNA Symbol'})
RNA_disease = pd.merge(RNA_disease, symbol_lookup, on='RNA Symbol')

# Tag every mapped identifier with its RNA type: '<id>#<RNA Type>'
RNA_disease['1_y'] = RNA_disease['1_y'].astype(str) + '#' + RNA_disease['RNA Type']
RNA_disease

In [None]:
# One '<RNA Type>-disease.txt' edge file per RNA type.
# 'miRNA' is skipped: 'miRNA-disease.txt' is already written from the
# miRBase-mapped miRNA_disease frame, and exporting the type again here would
# overwrite that file. The other per-type export loops in this notebook skip
# 'miRNA' in the same way.
for rna_type in set(RNA_disease['RNA Type']):
    if rna_type == 'miRNA':
        continue
    RNA_disease_ = RNA_disease[RNA_disease['RNA Type']==rna_type]

    if not RNA_disease_.empty:
        RNA_disease_[['1_y','1_x']].drop_duplicates().to_csv(
            edge_data_location + rna_type + '-disease.txt', header=None, sep='\t', index=None)

***
### [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/)
ncRDeathDB includes ncRNA types associated with apoptosis, autophagy, and necrosis.

In [None]:
#https://www.rna-society.org/ncrdeathdb/data/allNcRNACelldeathData.xlsx
# 'geneid' is read with the nullable string dtype, so missing values are pd.NA
RNA_pDeath = pd.read_excel(unprocessed_data_location + 'allNcRNACelldeathData.xlsx', dtype={"geneid": "string"})
# Keep only the rows whose organism name contains 'apiens' (Homo sapiens)
RNA_pDeath = RNA_pDeath[RNA_pDeath.Organism.str.contains('apiens')]
RNA_pDeath.drop(columns=['id','miRNA_symbol','miRBase_mature_ID','Gene_Symbol','Organism','tax_id','Synonyms',
                          'Links','chromosome','map_location','Description','type_of_gene','Full_name_from_nomenclature_authority',
                          'Other_designations'],inplace=True)
# Keep rows that carry a gene id or a miRBase id. Missing gene ids are pd.NA,
# not the text '<NA>', so they are tested with notna() rather than compared
# against that literal.
RNA_pDeath = RNA_pDeath[RNA_pDeath['geneid'].notna() | RNA_pDeath['miRBase_ID'].notna()]
# One row per ','-separated miRBase id
RNA_pDeath['miRBase_ID'] = RNA_pDeath.miRBase_ID.str.split(',')
RNA_pDeath = RNA_pDeath.explode('miRBase_ID')
RNA_pDeath

In [None]:
# Grounding: translate the three pathway names into GO identifiers.
# replace() leaves any other pathway value unchanged.
pathway_to_go = {
    'necrosis': 'GO_0097300',
    'autophagy': 'GO_0006914',
    'apoptosis': 'GO_0006915',
}
RNA_pDeath['gobp'] = RNA_pDeath['Pathway'].replace(pathway_to_go)
RNA_pDeath = RNA_pDeath.drop(columns=['Pathway'])
RNA_pDeath

In [None]:
# miRNA rows (the category also occurs with a trailing space) that carry a
# miRBase id
is_mirna = RNA_pDeath['RNA Category'].isin(['miRNA', 'miRNA '])
miRNA_pDeath = RNA_pDeath[is_mirna]
miRNA_pDeath = miRNA_pDeath[miRNA_pDeath['miRBase_ID'].notna()]

# Only 'MIMAT...' accessions are exported
maturemiRNA_pDeath = miRNA_pDeath[miRNA_pDeath['miRBase_ID'].str.startswith('MIMAT')]

mature_edges = maturemiRNA_pDeath[['miRBase_ID', 'gobp']].drop_duplicates()
mature_edges.to_csv(edge_data_location + 'miRNA-pDeath.txt',
                    header=None, sep='\t', index=None)

In [None]:
# Remember which rows really have a gene id BEFORE the category suffix is
# appended: once the column reads '<id>#<category>', a missing id has become
# text and can no longer be recognised by comparing with '<NA>'.
has_gene_id = RNA_pDeath['geneid'].notna()
RNA_pDeath['geneid'] = RNA_pDeath['geneid'].astype(str) + '#' + RNA_pDeath['RNA Category']

# One '<category>-pDeath.txt' edge file per non-miRNA category
for rna_type in set(RNA_pDeath['RNA Category']):
    if rna_type in ('miRNA', 'miRNA '):
        continue
    RNA_pDeath_ = RNA_pDeath[(RNA_pDeath['RNA Category'] == rna_type) & has_gene_id]
    # Keep identifiers that start with a digit
    RNA_pDeath_ = RNA_pDeath_[RNA_pDeath_['geneid'].str[0].str.isdigit()]

    if not RNA_pDeath_.empty:
        RNA_pDeath_[['geneid','gobp']].drop_duplicates().to_csv(
            edge_data_location + rna_type + '-pDeath.txt', header=None, sep='\t', index=None)

***
### [cncRNADB](https://www.rna-society.org/cncrnadb/)
cncRNAdb is a manually curated database of experimentally supported cncRNAs, which aims to provide a resource for efficient manipulation, browsing and analysis of cncRNAs.

In [None]:
#Translated ncRNA: https://www.rna-society.org/cncrnadb/download/Translated%20ncRNA.zip
RNA_anatomy = pd.read_excel(unprocessed_data_location + 'Translated ncRNA.xlsx')

# Rows whose organism name contains 'apiens' and that have a Gene.ID
RNA_anatomy = RNA_anatomy.loc[RNA_anatomy.Organism.str.contains('apiens')]
RNA_anatomy = RNA_anatomy.loc[RNA_anatomy['Gene.ID'].notna()]

# One lower-cased tissue/cell description per row
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.split(';')
RNA_anatomy = RNA_anatomy.explode('Tissue/Cell')
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.lower()

# Inner-join with desc_bto_map on that description
tissue_lookup = desc_bto_map.rename(columns={0: 'Tissue/Cell'})
RNA_anatomy = pd.merge(RNA_anatomy, tissue_lookup, on='Tissue/Cell')

unused_columns = ['cncRNAdb.ID','Name','Chromosome','Start','End','Strand','Peptide_length',
                  'Organism','Peptide','Human.gene.stable.ID','Chimpanzee.gene.stable.ID',
                  'Mouse.gene.stable.ID','Drosophila.melanogaster.gene.stable.ID',
                  'Zebrafish.gene.stable.ID','Tissue/Cell']
RNA_anatomy = RNA_anatomy.drop(columns=unused_columns)
RNA_anatomy

In [None]:
# Inspect the distinct values of the 'Type' column
set(RNA_anatomy['Type'])

In [None]:
# Tag every identifier with its RNA type: '<Gene.ID>#<Type>'.
# If the column was read as float, cast it to nullable integers first so the
# identifier is written as '123#...' rather than '123.0#...'.
gene_ids = RNA_anatomy['Gene.ID']
if pd.api.types.is_float_dtype(gene_ids):
    gene_ids = gene_ids.astype('Int64')
RNA_anatomy['Gene.ID'] = gene_ids.astype(str) + '#' + RNA_anatomy['Type']

# One '<Type>-anatomy.txt' edge file per RNA type, restricted to identifiers
# that start with a digit
for rna_type in set(RNA_anatomy['Type']):
    RNA_anatomy_ = RNA_anatomy[RNA_anatomy['Type']==rna_type]
    RNA_anatomy_ = RNA_anatomy_[(RNA_anatomy_['Gene.ID'].str[0].str.isdigit())]
    if not RNA_anatomy_.empty:
        RNA_anatomy_[['Gene.ID',1]].drop_duplicates().to_csv(
            edge_data_location + rna_type + '-anatomy.txt', header=None, sep='\t', index=None)

In [None]:
#Untranslated mRNA: https://www.rna-society.org/cncrnadb/download/Untranslated%20mRNA.zip
# Stack the three workbooks. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
RNA_anatomy = pd.concat([
    pd.read_excel(unprocessed_data_location + 'Regulatory mRNA.xlsx'),
    pd.read_excel(unprocessed_data_location + 'Scaffold mRNA.xlsx'),
    pd.read_excel(unprocessed_data_location + 'Sponge mRNA.xlsx')])
# Rows whose organism name contains 'apiens' and that have an Entrez id
RNA_anatomy = RNA_anatomy[RNA_anatomy.Organism.str.contains('apiens')]
RNA_anatomy = RNA_anatomy[RNA_anatomy['Entrez.ID'].notna()]
# One lower-cased tissue/cell description per row, inner-joined with desc_bto_map
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.split(';')
RNA_anatomy = RNA_anatomy.explode('Tissue/Cell')
RNA_anatomy['Tissue/Cell'] = RNA_anatomy['Tissue/Cell'].str.lower()
RNA_anatomy = pd.merge(RNA_anatomy, desc_bto_map.rename(columns={0: 'Tissue/Cell'}),
                       on='Tissue/Cell')
RNA_anatomy.drop(columns=['cncRNAdb.ID','Type','Name','Ensembl.ID','Chromosome','Start','End','Strand','Function',
                          'Organism','Human.gene.stable.ID','Chimpanzee.gene.stable.ID',
                          'Mouse.gene.stable.ID','Drosophila.melanogaster.gene.stable.ID',
                          'Zebrafish.gene.stable.ID','Tissue/Cell'],inplace=True)
# The Int64 cast keeps the identifier free of a trailing '.0' before '#mRNA'
RNA_anatomy['Entrez.ID'] = RNA_anatomy['Entrez.ID'].astype('Int64').astype(str) + '#mRNA'
RNA_anatomy.head()

In [None]:
# De-duplicated (Entrez id, column 1) pairs, written without header or index
mRNA_anatomy_edges = RNA_anatomy[['Entrez.ID', 1]].drop_duplicates()
mRNA_anatomy_edges.to_csv(edge_data_location + 'mRNA-anatomy.txt',
                          header=None, sep='\t', index=None)

***
### [ViRBase](https://www.rna-society.org/ViRBase/)
ViRBase aims to collect the complex interactions between viral and cellular ncRNAs and their viral and cellular targets, and to provide detailed RNA annotation. It contributes to the understanding of viral infection and to the development of new antiviral therapies.

In [None]:
#http://www.rna-society.org/virbase/download/all_ncRNA_associated_interactions.zip
ViRBase = pd.read_csv(unprocessed_data_location+'all_ncRNA_associated_interactions.txt',sep='\t')

# Keep the interactions whose host species name contains 'apiens' ...
ViRBase = ViRBase.loc[ViRBase['Host Species'].str.contains('apiens')]
# ... and whose score is at least 0.7 (see http://www.rna-society.org/virbase/help.html Q8)
ViRBase = ViRBase.loc[ViRBase['Score'] >= 0.7]

unused_columns = ['ViRBase ID','Taxonomy ID','Virus Name','Virus Strain Name','Virus Family',
                  'Host Species','Interactor1 Symbol','Interactor2 Symbol']
ViRBase = ViRBase.drop(columns=unused_columns)
ViRBase

In [None]:
# Interactions already oriented host (interactor 1) -> virus (interactor 2)
first_is_host = ViRBase['Interactor1 Source'] == 'host'
second_is_virus = ViRBase['Interactor2 Source'] == 'virus'
host_virus = ViRBase[first_is_host & second_is_virus]
host_virus

In [None]:
# Interactions oriented virus (interactor 1) -> host (interactor 2): swap the
# two groups of column names so the host side always sits under 'Interactor1'
first_is_virus = ViRBase['Interactor1 Source'] == 'virus'
second_is_host = ViRBase['Interactor2 Source'] == 'host'
swapped_names = {
    'Interactor1 Source': 'Interactor2 Source',
    'Interactor1 Category': 'Interactor2 Category',
    'Interactor1 ID': 'Interactor2 ID',
    'Interactor2 Source': 'Interactor1 Source',
    'Interactor2 Category': 'Interactor1 Category',
    'Interactor2 ID': 'Interactor1 ID',
}
virus_host = ViRBase[first_is_virus & second_is_host].rename(columns=swapped_names)
virus_host

In [None]:
# Stack both orientations. pd.concat is used because DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
ViRBase = pd.concat([virus_host, host_virus])
# Mark every second-interactor category as viral, then drop the source columns
ViRBase['Interactor2 Category'] = 'viral_'+ViRBase['Interactor2 Category']
ViRBase.drop(columns=['Interactor2 Source','Interactor1 Source'],inplace=True)
ViRBase

In [None]:
# Inspect the distinct categories found on the first-interactor side
set(ViRBase['Interactor1 Category'])

In [None]:
# Inspect the distinct categories found on the second-interactor side
set(ViRBase['Interactor2 Category'])

In [None]:
# First-interactor categories whose identifier is suffixed with '#<category>'
tagged_categories_1 = ['circRNA', 'lncRNA', 'mRNA', 'other', 'pseudo',
                       'scRNA', 'snRNA', 'snoRNA', 'unknown']
i = ViRBase[ViRBase['Interactor1 Category'].isin(tagged_categories_1)].index.values
ViRBase.loc[i, "Interactor1 ID"] = (ViRBase.loc[i, "Interactor1 ID"] + '#'
                                    + ViRBase.loc[i, "Interactor1 Category"])

# Same treatment for the listed second-interactor (viral) categories
tagged_categories_2 = ['viral_lncRNA', 'viral_mRNA', 'viral_nsRNA',
                       'viral_snoRNA', 'viral_unassigned RNA']
i = ViRBase[ViRBase['Interactor2 Category'].isin(tagged_categories_2)].index.values
ViRBase.loc[i, "Interactor2 ID"] = (ViRBase.loc[i, "Interactor2 ID"] + '#'
                                    + ViRBase.loc[i, "Interactor2 Category"])
ViRBase

In [None]:
# Rows whose first interactor is a protein are re-keyed through entrez_pro_map:
# the map's column 0 is matched against 'Interactor1 ID' and its column 1
# replaces that identifier. Rows without a match are lost by the inner join.
is_protein = ViRBase['Interactor1 Category'] == 'protein'
ViRBase_pro = ViRBase[is_protein]
entrez_pro_map[0] = entrez_pro_map[0].astype(str)
ViRBase_pro = pd.merge(ViRBase_pro, entrez_pro_map.rename(columns={0:'Interactor1 ID'}), on=['Interactor1 ID'])
ViRBase_pro = ViRBase_pro.drop(columns=['Interactor1 ID']).rename(columns={1:'Interactor1 ID'})
# The merged rows carry a fresh 0..n-1 index that would collide with the labels
# of the remaining rows, so the combined frame gets a new unique index
# (ignore_index=True); later label-based selections then cannot hit the wrong
# rows. pd.concat also replaces DataFrame.append, which pandas 2.0 removed.
ViRBase = pd.concat([ViRBase_pro, ViRBase[~is_protein]], ignore_index=True)
ViRBase

In [None]:
# Work on an explicit copy so the column assignment below acts on this frame
viralprotein_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_protein')].copy()

# Hand-curated translation of viral protein identifiers/symbols to PRO ids
viral_protein_to_pro = {'1489078':'PR_P03126',
                        '1489080':'PR_P03120',
                        # Only papilloma type 16 is considered in PRO
                        '3783750':'PR_P03230',
                        '3783774':'PR_P03211',
                        '944566':'PR_000008466',
                        '944568':'PR_P0C6K0',
                        'E':'PR_000036822',
                        'M1':'PR_000049763',
                        'NP':'PR_000049760',
                        'NS1':'PR_000036824',
                        'NS3':'PR_000036828',
                        'P40':'PR_000038390',
                        'PB1':'PR_000049745',
                        'Pol':'PR_000044455',
                        'env':'PR_000003225',
                        'gag':'PR_000048976'}
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: the in-place form on a column selection is not guaranteed
# to modify the frame (and never does under pandas Copy-on-Write).
viralprotein_RNA['Interactor2 ID'] = viralprotein_RNA['Interactor2 ID'].replace(viral_protein_to_pro)
# Keep only the rows that now hold a PRO identifier. Testing the 'PR_' prefix
# (rather than just a leading 'P') also rejects unmapped symbols that merely
# begin with 'P'.
viralprotein_RNA = viralprotein_RNA[viralprotein_RNA['Interactor2 ID'].str.startswith('PR_', na=False)]
viralprotein_RNA

In [None]:
# Swap the original viral-protein rows for their PRO-mapped version.
# The rows are removed with a boolean mask: dropping by index label would also
# delete unrelated rows whenever labels are duplicated in ViRBase. pd.concat
# replaces DataFrame.append (removed in pandas 2.0) and ignore_index=True gives
# the result a unique index.
not_viral_protein = ViRBase['Interactor2 Category'] != 'viral_protein'
ViRBase = pd.concat([viralprotein_RNA, ViRBase[not_viral_protein]], ignore_index=True)
ViRBase

In [None]:
# Keep only the edges whose two identifiers start with 'PR', with 'MI', or
# with a digit. A missing identifier fails the test.
id1 = ViRBase['Interactor1 ID']
id2 = ViRBase['Interactor2 ID']
valid_id1 = (id1.str.startswith('PR', na=False) |
             id1.str.startswith('MI', na=False) |
             id1.str[0].str.isdigit().fillna(False).astype(bool))
valid_id2 = (id2.str.startswith('PR', na=False) |
             id2.str.startswith('MI', na=False) |
             id2.str[0].str.isdigit().fillna(False).astype(bool))
# reset_index gives an independent frame with unique labels, so the mask-based
# assignments below touch exactly the intended rows
ViRBase = ViRBase[valid_id1 & valid_id2].reset_index(drop=True)

ViRBase['Interactor1 ID'] = ViRBase['Interactor1 ID'].str.replace("other", 'others', regex=False)

# First interactor: 'MI...' accessions that are not 'MIMAT...' are precursors
id1 = ViRBase['Interactor1 ID']
host_precursor = id1.str.startswith('MI') & ~id1.str.startswith('MIMAT')
ViRBase.loc[host_precursor, "Interactor1 Category"] = 'premiRNA'

# Second interactor: the category is decided from the IDENTIFIER.
# 'MIMAT...' -> 'viral_miRNA'; any other 'MI...' -> 'viral_premiRNA'.
# Both labels are set before the subsets are taken, so a precursor that was
# annotated as 'viral_miRNA' in the source ends up in the precursor subset
# only, and 'viral_premiRNA' is the label viralpremiRNA_RNA selects on.
id2 = ViRBase['Interactor2 ID']
viral_mature = id2.str.startswith('MIMAT')
viral_precursor = id2.str.startswith('MI') & ~viral_mature
ViRBase.loc[viral_mature, "Interactor2 Category"] = 'viral_miRNA'
ViRBase.loc[viral_precursor, "Interactor2 Category"] = 'viral_premiRNA'

# One subset per viral (second-interactor) category
viralmaturemiRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_miRNA')]
viralpremiRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_premiRNA')]
virallncRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_lncRNA')]
viralmRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_mRNA')]
viralnsRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_nsRNA')]
viralsnoRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_snoRNA')]
viralunRNA_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_unassigned RNA')]
viralprotein_RNA = ViRBase[(ViRBase['Interactor2 Category']=='viral_protein')]

ViRBase

In [None]:
# Each viral subset is split by the category of its first interactor and every
# non-empty part is written to '<Interactor1 Category><suffix>' as
# de-duplicated (Interactor1 ID, Interactor2 ID) pairs.
viral_edge_sets = [
    (virallncRNA_RNA, '-virallncRNA.txt'),
    (viralmRNA_RNA, '-viralmRNA.txt'),
    (viralnsRNA_RNA, '-viralnsRNA.txt'),
    (viralsnoRNA_RNA, '-viralsnoRNA.txt'),
    (viralunRNA_RNA, '-viralunRNA.txt'),
    (viralpremiRNA_RNA, '-viralpremiRNA.txt'),
    (viralprotein_RNA, '-viralprotein.txt'),
    (viralmaturemiRNA_RNA, '-viralmiRNA.txt'),
]

for viral_frame, file_suffix in viral_edge_sets:
    for host_type in set(viral_frame['Interactor1 Category']):
        host_rows = viral_frame[viral_frame['Interactor1 Category'] == host_type]
        if host_rows.empty:
            continue
        host_edges = host_rows[['Interactor1 ID', 'Interactor2 ID']].drop_duplicates()
        host_edges.to_csv(edge_data_location + host_type + file_suffix,
                          header=None, sep='\t', index=None)

***
### [Vesiclepedia](http://microvesicles.org/index.html)
Vesiclepedia is a manually curated compendium of molecular data (lipid, RNA and protein) identified in different classes of extracellular vesicles. 

In [None]:
# http://microvesicles.org/Archive/VESICLEPEDIA_EXPERIMENT_DETAILS_4.1.txt
# (the experiment table is read straight from the URL)
experiments = pd.read_csv('http://microvesicles.org/Archive/VESICLEPEDIA_EXPERIMENT_DETAILS_4.1.txt', sep='\t')
experiments = experiments.loc[experiments['SPECIES'].str.contains('apiens')]

# http://microvesicles.org/Archive/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_4.1.txt
protein_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_PROTEIN_MRNA_DETAILS_4.1.txt', sep='\t')
protein_ev = protein_ev.loc[protein_ev['SPECIES'].str.contains('apiens')]

# Attach the experiment details to every content row
protein_ev = pd.merge(protein_ev, experiments, on=['EXPERIMENT ID'])
unused_columns = ['CONTENT ID','GENE SYMBOL','SPECIES_x','EXPERIMENT ID','SPECIES_y',
                  'SAMPLE NAME','IDENTIFICATIONS','METHODS_x','YEAR']
protein_ev = protein_ev.drop(columns=unused_columns)
protein_ev

In [None]:
# Inspect the distinct vesicle type labels before they are mapped to GO terms
protein_ev['VESICLE TYPE'].unique()

In [None]:
# Ordered rewrite rules (pattern, replacement, is_regex) turning the free-text
# vesicle type into a GO identifier. Order matters: the final rule prefixes
# 'GO_1990742' to every value that does not start with 'GO_' after the earlier
# rules have run. The regex flag is explicit because the default of
# str.replace changed between pandas 1.x and 2.x.
vesicle_rules = [
    (r".*xosomes.*", "GO_0070062", True),
    ("Membrane blebs", "GO_0032059", False),
    ("Apoptotic bodies", "GO_0097189", False),
    (r".*embrane", "GO_0016020", True),
    (r".*icrovesicles", "GO_1990742", True),
    (r"^(?!GO_.*)", "GO_1990742", True),
]
for pattern, go_id, is_regex in vesicle_rules:
    protein_ev['VESICLE TYPE'] = protein_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=is_regex)
# Keep only 'GO_' + the first run of digits (raw string: '\d' is not a valid
# escape sequence in a normal string literal)
protein_ev['VESICLE TYPE'] = 'GO_'+protein_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)
protein_ev

In [None]:
# Inspect the distinct content type labels (used below to split the table)
protein_ev['CONTENT TYPE'].unique()

In [None]:
# mRNA rows (the label occurs in two spellings), without the content type column
mRNA_ev = protein_ev[protein_ev['CONTENT TYPE'].isin(['mRNA', 'mrna'])].drop(columns=['CONTENT TYPE'])
# Keep rows with an Entrez id that starts with a digit; .copy() makes the
# result an independent frame so the assignment below is unambiguous
mRNA_ev = mRNA_ev[(mRNA_ev['ENTREZ GENE ID'].notna())]
mRNA_ev = mRNA_ev[(mRNA_ev['ENTREZ GENE ID'].astype(str).str[0].str.isdigit())].copy()
# A column read as float would be written as '123.0#mRNA'; cast it to nullable
# integers first so the identifier becomes '123#mRNA'
entrez_ids = mRNA_ev['ENTREZ GENE ID']
if pd.api.types.is_float_dtype(entrez_ids):
    entrez_ids = entrez_ids.astype('Int64')
mRNA_ev['ENTREZ GENE ID'] = entrez_ids.astype(str) + '#mRNA'
mRNA_ev[['ENTREZ GENE ID','VESICLE TYPE']].drop_duplicates().to_csv(
    edge_data_location + 'mRNA-ev.txt', header=None, sep='\t', index=None)

In [None]:
# snRNA rows, without the content type column
snRNA_ev = protein_ev[protein_ev['CONTENT TYPE']=='snrna'].drop(columns=['CONTENT TYPE'])
# Keep rows with an Entrez id that starts with a digit; .copy() makes the
# result an independent frame so the assignment below is unambiguous
snRNA_ev = snRNA_ev[(snRNA_ev['ENTREZ GENE ID'].notna())]
snRNA_ev = snRNA_ev[(snRNA_ev['ENTREZ GENE ID'].astype(str).str[0].str.isdigit())].copy()
# A column read as float would be written as '123.0#snRNA'; cast it to nullable
# integers first so the identifier becomes '123#snRNA'
entrez_ids = snRNA_ev['ENTREZ GENE ID']
if pd.api.types.is_float_dtype(entrez_ids):
    entrez_ids = entrez_ids.astype('Int64')
snRNA_ev['ENTREZ GENE ID'] = entrez_ids.astype(str) + '#snRNA'
snRNA_ev[['ENTREZ GENE ID','VESICLE TYPE']].drop_duplicates().to_csv(
    edge_data_location + 'snRNA-ev.txt', header=None, sep='\t', index=None)

In [None]:
# Protein rows (the label also occurs with a trailing space)
is_protein = protein_ev['CONTENT TYPE'].isin(['protein', 'protein '])
protein_ev = protein_ev[is_protein].drop(columns=['CONTENT TYPE'])

# Keep rows with an Entrez id that starts with a digit
protein_ev = protein_ev[protein_ev['ENTREZ GENE ID'].notna()]
protein_ev = protein_ev[protein_ev['ENTREZ GENE ID'].astype(str).str[0].str.isdigit()]

# The join key of entrez_pro_map is cast to int before merging on it
entrez_pro_map[0] = entrez_pro_map[0].astype(int)
entrez_lookup = entrez_pro_map.rename(columns={0:'ENTREZ GENE ID'})

protein_ev = pd.merge(entrez_lookup, protein_ev, on=['ENTREZ GENE ID'])

protein_edges = protein_ev[[1, 'VESICLE TYPE']].drop_duplicates()
protein_edges.to_csv(edge_data_location + 'protein-ev.txt',
                     header=None, sep='\t', index=None)

In [None]:
# http://microvesicles.org/Archive/VESICLEPEDIA_MIRNA_DETAILS_4.1.txt
miRNA_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_MIRNA_DETAILS_4.1.txt', sep='\t')
miRNA_ev = miRNA_ev[miRNA_ev['SPECIES'].str.contains('apiens')]
miRNA_ev = pd.merge(miRNA_ev, experiments, on=['EXPERIMENT ID'])

# Prefix the miRNA names with 'hsa-' before they are matched against
# mirna_mirbase_map
miRNA_ev['MIRNA ID'] = 'hsa-' + miRNA_ev['MIRNA ID'].astype(str)

# Ordered rewrite rules (pattern, replacement, is_regex) turning the free-text
# vesicle type into a GO identifier. Order matters: the final rule prefixes
# 'GO_1990742' to every value that does not start with 'GO_' after the earlier
# rules have run. The regex flag is explicit because the default of
# str.replace changed between pandas 1.x and 2.x.
vesicle_rules = [
    (r".*xosomes.*", "GO_0070062", True),
    ("Membrane blebs", "GO_0032059", False),
    ("Apoptotic bodies", "GO_0097189", False),
    (r".*embrane", "GO_0016020", True),
    (r".*icrovesicles", "GO_1990742", True),
    (r"^(?!GO_.*)", "GO_1990742", True),
]
for pattern, go_id, is_regex in vesicle_rules:
    miRNA_ev['VESICLE TYPE'] = miRNA_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=is_regex)
# Keep only 'GO_' + the first run of digits (raw string: '\d' is not a valid
# escape sequence in a normal string literal)
miRNA_ev['VESICLE TYPE'] = 'GO_'+miRNA_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)

miRNA_ev = pd.merge(mirna_mirbase_map.rename(columns={0:'MIRNA ID'}), miRNA_ev, on=['MIRNA ID'])
miRNA_ev.drop(columns=['CONTENT ID','CONTENT TYPE','COMMENTS','Entrez GENE ID','SPECIES_x',
                       'EXPERIMENT ID','SPECIES_y','SAMPLE NAME','IDENTIFICATIONS','METHODS_x',
                       'YEAR','MIRNA ID'],inplace=True)

miRNA_ev

In [None]:
# Column 1 holds the accession taken from mirna_mirbase_map: 'MIMAT...'
# accessions go to the mature file, every other row to the precursor file
is_mature = miRNA_ev[1].str.startswith('MIMAT')
maturemiRNA_ev = miRNA_ev[is_mature]
premiRNA_ev = miRNA_ev[~is_mature]

for ev_frame, ev_file in ((maturemiRNA_ev, 'miRNA-ev.txt'),
                          (premiRNA_ev, 'premiRNA-ev.txt')):
    ev_frame[[1, 'VESICLE TYPE']].drop_duplicates().to_csv(
        edge_data_location + ev_file, header=None, sep='\t', index=None)

In [None]:
# http://microvesicles.org/Archive/VESICLEPEDIA_LIPID_DETAILS_4.1.txt
lipid_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_LIPID_DETAILS_4.1.txt', sep='\t')
lipid_ev = lipid_ev[lipid_ev['SPECIES'].str.contains('apiens')]
lipid_ev = pd.merge(lipid_ev, experiments, on=['EXPERIMENT ID'])
# Lower-case the lipid names before they are matched against desc_chebi_map
lipid_ev['LIPID ID'] = lipid_ev['LIPID ID'].str.lower()

# Ordered rewrite rules (pattern, replacement, is_regex) turning the free-text
# vesicle type into a GO identifier. Order matters: the final rule prefixes
# 'GO_1990742' to every value that does not start with 'GO_' after the earlier
# rules have run. The regex flag is explicit because the default of
# str.replace changed between pandas 1.x and 2.x.
vesicle_rules = [
    (r".*xosomes.*", "GO_0070062", True),
    ("Membrane blebs", "GO_0032059", False),
    ("Apoptotic bodies", "GO_0097189", False),
    (r".*embrane", "GO_0016020", True),
    (r".*icrovesicles", "GO_1990742", True),
    (r"^(?!GO_.*)", "GO_1990742", True),
]
for pattern, go_id, is_regex in vesicle_rules:
    lipid_ev['VESICLE TYPE'] = lipid_ev['VESICLE TYPE'].str.replace(pattern, go_id, regex=is_regex)
# Keep only 'GO_' + the first run of digits (raw string: '\d' is not a valid
# escape sequence in a normal string literal)
lipid_ev['VESICLE TYPE'] = 'GO_'+lipid_ev['VESICLE TYPE'].str.extract(r'(\d+)', expand=False)

lipid_ev = pd.merge(desc_chebi_map.rename(columns={0:'LIPID ID'}), lipid_ev, on=['LIPID ID'])
lipid_ev.drop(columns=['LIPID ID', 'CONTENT ID','CONTENT TYPE','SPECIES_x','EXPERIMENT ID','SPECIES_y',
                       'SAMPLE NAME','IDENTIFICATIONS','METHODS_x','YEAR'], inplace=True)

lipid_ev

In [None]:
# De-duplicated (column 1, vesicle GO id) pairs, written without header or index
lipid_edges = lipid_ev[[1, 'VESICLE TYPE']].drop_duplicates()
lipid_edges.to_csv(edge_data_location + 'lipid-ev.txt',
                   header=None, sep='\t', index=None)

***
### [directRMDB](http://www.rnamd.org/directRMDB/index.html)
DirectRMDB is a database of quantitative RNA modification profiles.

In [None]:
# Modification table, without its positional/annotation columns
site_columns = ['seqnames','start','end','width','strand','transcripts_info',
                'NGS_site','Ensembl_ID','miRNA_Num','Gene_Biotype']
DirectRMDB = pd.read_csv(unprocessed_data_location+'DirectRMDB.txt', sep='\t')
DirectRMDB = DirectRMDB.drop(columns=site_columns)

# miRNA table, restricted to rows whose species name contains 'apiens'
mirna_columns = ['seqnames','start','end','width','strand','Source','Species','Region']
DirectRMDB2 = pd.read_csv(unprocessed_data_location+'HomoSapiens_miRNA.txt', sep='\t')
DirectRMDB2 = DirectRMDB2.loc[DirectRMDB2['Species'].str.contains('apiens')]
DirectRMDB2 = DirectRMDB2.drop(columns=mirna_columns)

# Join both tables on 'ID', then join mirna_mirbase_map on the miRNA 'Name'
DirectRMDB = pd.merge(DirectRMDB, DirectRMDB2,on ='ID')
name_lookup = mirna_mirbase_map.rename(columns={0:'Name'})
DirectRMDB = pd.merge(DirectRMDB, name_lookup,on ='Name')
DirectRMDB = DirectRMDB.drop(columns=['ID','Name'])
DirectRMDB

In [None]:
# Inspect the distinct modification labels before they are mapped to SO terms
DirectRMDB.modification.unique()

In [None]:
# Translate the modification labels into SO identifiers; labels that are not
# listed are left unchanged by replace().
modification_to_so = {
    'Psi': 'SO_0001373',
    'm5C': 'SO_0001918',
    'm6A': 'SO_0001920',
    'm7G': 'SO_0001326',
    'AtoI': 'SO_0001274',
    'm1A': 'SO_0001295',
    'Cm': 'SO_0001283',
    'Tm': 'SO_0001382',
    'm6Am': 'SO_0001312',
    'Am': 'SO_0001298',
    'Gm': 'SO_0001327',
    'm5U': 'SO_0001344',
}
# The result is assigned back to the column: replace(inplace=True) on a column
# selection is a chained assignment, which pandas warns about and which does
# not modify the frame under Copy-on-Write.
DirectRMDB['modification'] = DirectRMDB['modification'].replace(modification_to_so)

In [None]:
# Column 1 holds the accession taken from mirna_mirbase_map: 'MIMAT...'
# accessions go to the mature file, every other row to the precursor file
is_mature = DirectRMDB[1].str.startswith('MIMAT')
maturemiRNA_epiMod = DirectRMDB[is_mature]
premiRNA_epiMod = DirectRMDB[~is_mature]

for epiMod_frame, epiMod_file in ((maturemiRNA_epiMod, 'miRNA-epiMod.txt'),
                                  (premiRNA_epiMod, 'premiRNA-epiMod.txt')):
    epiMod_frame[[1, 'modification']].drop_duplicates().to_csv(
        edge_data_location + epiMod_file, header=None, sep='\t', index=None)

***
### [Modomics](https://genesilico.pl/modomics/)
Modomics is a database of RNA modifications that provides comprehensive information concerning the chemical structures of modified ribonucleosides, their biosynthetic pathways, the location of modified residues in RNA sequences, and RNA modifying enzymes.

In [None]:
modomics = pd.read_csv(unprocessed_data_location+'modomics.csv')

# One row per space-separated enzyme
modomics['Enzymes'] = modomics['Enzymes'].str.split(' ')
modomics = modomics.explode('Enzymes')

# Inner-join with symbol_to_pro on the enzyme, then drop the enzyme column
enzyme_lookup = symbol_to_pro.rename(columns={0:'Enzymes'})
modomics = pd.merge(modomics, enzyme_lookup, on='Enzymes')
modomics = modomics.drop(columns=['Enzymes'])
modomics

In [None]:
# Inspect the distinct reaction labels before they are mapped to SO terms
modomics.Reaction.unique()

In [None]:
# Translate the reaction labels into SO identifiers; labels that are not
# listed are left unchanged by replace(). Every identifier uses the 'SO_'
# form (the 'xX:Xm' entry previously read 'SO:0001353', with a colon).
reaction_to_so = {
    'C:m5C': 'SO_0001918',
    'xX:Xm': 'SO_0001353',
    'A:m6A': 'SO_0001920',
    'A:I': 'SO_0001274',
    'C:U': 'SO_1000011',
    'U:Y': 'SO_0001332',
    'A:m1A': 'SO_0001295',
}
# The result is assigned back to the column: replace(inplace=True) on a column
# selection is a chained assignment, which pandas warns about and which does
# not modify the frame under Copy-on-Write.
modomics['Reaction'] = modomics['Reaction'].replace(reaction_to_so)

modomics[[1, 'Reaction']].drop_duplicates().to_csv(
    edge_data_location + 'protein-epiMod.txt', header=None, sep='\t', index=None)

In [None]:
modomics = pd.read_csv(unprocessed_data_location+'modomics2.csv')
modomics = modomics.loc[modomics['Organism'].str.contains('apiens')]

# Inner-join with symbol_entrez_map on the 'ORF/Alternative name' column
name_lookup = symbol_entrez_map.rename(columns={0:'ORF/Alternative name'})
modomics = pd.merge(modomics, name_lookup, on ='ORF/Alternative name')

# RNA class = first three lower-cased characters of the name + 'RNA'.
# It is appended to the mapped identifier ('<id>#<class>') and it also
# replaces the name column itself.
rna_class = modomics['ORF/Alternative name'].astype(str).str.lower().str[0:3] + 'RNA'
modomics[1] = modomics[1].astype(str) + '#' + rna_class
modomics['ORF/Alternative name'] = rna_class
modomics['Modification type'].unique()

In [None]:
# Translate the modification labels into SO identifiers; labels that are not
# listed are left unchanged by replace().
modification_to_so = {'Y':'SO_0001332','Cm': 'SO_0001283','Gm':'SO_0001327',
                      'Am':'SO_0001298','Um':'SO_0001345'}
# The result is assigned back to the column: replace(inplace=True) on a column
# selection is a chained assignment, which pandas warns about and which does
# not modify the frame under Copy-on-Write.
modomics['Modification type'] = modomics['Modification type'].replace(modification_to_so)

In [None]:
# Split by the RNA class stored in 'ORF/Alternative name' and write the
# de-duplicated (identifier, modification) pairs of each class
rna_class = modomics['ORF/Alternative name']
modomics_scaRNA = modomics[rna_class == 'scaRNA']
modomics_snoRNA = modomics[rna_class == 'snoRNA']

for class_frame, class_file in ((modomics_scaRNA, 'scaRNA-epiMod.txt'),
                                (modomics_snoRNA, 'snoRNA-epiMod.txt')):
    class_frame[[1, 'Modification type']].drop_duplicates().to_csv(
        edge_data_location + class_file, header=None, sep='\t', index=None)

***
### Remove unprocessed raw data

In [None]:
# Disabled: uncommenting the next line deletes the directory tree at unprocessed_data_location.
#shutil.rmtree(unprocessed_data_location)

***
#### PheKnowLator requires at least 2 rows per edge file (headers were removed), so single-row files are duplicated

In [None]:
# Read the edge source list; column 1 holds each edge file's path, which is
# made relative to this notebook by prefixing '../'.
nodes = pd.read_csv('../resources/edge_source_list.txt', sep=', ', header=None, engine='python')
nodes[1] = '../' + nodes[1].astype(str)

for edge_path in nodes[1]:
    edge_df = pd.read_csv(edge_path, sep='\t', header=None)
    # A file with a single row is rewritten with that row repeated twice.
    # DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
    if len(edge_df) == 1:
        pd.concat([edge_df, edge_df]).to_csv(edge_path, header=None, sep='\t', index=None)

## Non-ontology data

In [None]:
# Split each edge name in column 0 at its first '-' into the two node types
# and print the set of all node types that occur.
nodes = pd.read_csv('../resources/edge_source_list.txt', sep=', ', header=None, engine='python')
# n must be passed by keyword: pandas 2.0 made every str.split argument after
# the pattern keyword-only.
nodes[['A', 'B']] = nodes[0].str.split('-', n=1, expand=True)
a = set(nodes['A'])
b = set(nodes['B'])
print(a.union(b))

In [None]:
# Provided by PKL ecosystem
data_downloader(processed_url+'subclass_construction_map.pkl', '../resources/construction_approach/')

# Load data, print row count, and preview it
nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_map.pkl')

# For instance, ncbi IDs are mapped to appropriate SO Ontology entries
list(nonO_data.items())[:5]

***
### Pseudogene sequences

In [None]:
# Collect every pseudogene identifier used in the edge files and map each one
# to ['SO_0000336'] in the non-ontology subclass map.
# Entries are (edge file, column holding the pseudogene identifier), in read order.
pseudo_sources = [
    ('premiRNA-pseudogene.txt', 1),
    ('miRNA-pseudogene.txt', 1),
    ('othersRNA-pseudogene.txt', 1),
    ('snRNA-pseudogene.txt', 1),
    ('tRF-pseudogene.txt', 1),
    ('snoRNA-pseudogene.txt', 1),
    ('lncRNA-pseudogene.txt', 1),
    ('pseudogene-mRNA.txt', 0),
    ('pseudogene-pseudogene.txt', 1),
    ('pseudogene-rRNA.txt', 0),
    ('pseudogene-pseudogene.txt', 0),
    ('pseudogene-protein.txt', 0),
    ('pseudo-RBP.txt', 0),
    ('pseudo-TF.txt', 0),
    ('pseudo-chemical.txt', 0),
    ('pseudo-histoneModification.txt', 0),
    ('pseudo-GO.txt', 0),
    ('pseudo-disease.txt', 0),
    ('pseudo-viralmRNA.txt', 0),
    ('pseudo-viralmiRNA.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
pseudo_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in pseudo_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
pseudononO_data = {'SO': dict.fromkeys(pseudo_ids, ['SO_0000336'])}
nonO_data = {**nonO_data, **pseudononO_data['SO']}

***
### miRNA sequences

In [None]:
# Map miRBase accessions (column 1) to an ontology class: accessions starting
# with 'MIMAT' get ['SO_0000276'], all others get ['SO_0000647'].
mirna_mirbase_map = pd.read_csv(processed_data_location + 'MIRNA_MIRBASE_MAP.txt', header=None, sep='\t')
is_mature = mirna_mirbase_map[1].str.startswith('MIMAT')

# .copy() makes each subset an independent frame, so adding the 'SO' column
# does not assign into a slice of mirna_mirbase_map (SettingWithCopyWarning).
mature_mirna = mirna_mirbase_map[is_mature].copy()
mature_mirna['SO'] = [['SO_0000276']] * len(mature_mirna)

pre_mirna = mirna_mirbase_map[~is_mature].copy()
pre_mirna['SO'] = [['SO_0000647']] * len(pre_mirna)

mirna_mirbase_map = pd.concat([mature_mirna, pre_mirna])

# Drop column 0 and key the mapping on the accession in column 1
mirna_nonO = mirna_mirbase_map.drop(0, axis=1).set_index(1).to_dict()
nonO_data = {**nonO_data, **mirna_nonO['SO']}

***
### mRNA sequences

In [None]:
# Collect every mRNA identifier used in the edge files and map each one to
# ['SO_0000234'] in the non-ontology subclass map.
# Entries are (edge file, column holding the mRNA identifier), in read order.
# NOTE(review): lncRNA-mRNA.txt (its column 0 is read in the lncRNA section) is
# not listed here — confirm whether its column 1 should be included.
mRNA_sources = [
    ('premiRNA-mRNA.txt', 1),
    ('miRNA-mRNA.txt', 1),
    ('othersRNA-mRNA.txt', 1),
    ('snRNA-mRNA.txt', 1),
    ('tRNA-mRNA_NCBI.txt', 1),
    ('tRNA-mRNA_gtRNA.txt', 1),
    ('piRNA-mRNA.txt', 1),
    ('tRF-mRNA.txt', 1),
    ('snoRNA-mRNA.txt', 1),
    ('eRNA-mRNA.txt', 1),
    ('scRNA-mRNA.txt', 1),
    ('pseudogene-mRNA.txt', 1),
    ('mRNA-mRNA.txt', 0),
    ('mRNA-mRNA.txt', 1),
    ('mRNA-rRNA.txt', 0),
    ('mRNA-ncRNA.txt', 0),
    ('mRNA-scaRNA.txt', 0),
    ('mRNA-protein.txt', 0),
    ('mRNA-RBP.txt', 0),
    ('mRNA-TF.txt', 0),
    ('mRNA-gene.txt', 0),
    ('mRNA-chemical.txt', 0),
    ('mRNA-histoneModification.txt', 0),
    ('mRNA-GO.txt', 0),
    ('mRNA-disease.txt', 0),
    ('mRNA-anatomy.txt', 0),
    ('mRNA-viralnsRNA.txt', 0),
    ('mRNA-viralmiRNA.txt', 0),
    ('mRNA-ev.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
mRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in mRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
mRNAnonO_data = {'SO': dict.fromkeys(mRNA_ids, ['SO_0000234'])}
nonO_data = {**nonO_data, **mRNAnonO_data['SO']}

***
### TEC sequences

In [None]:
# Unique identifiers from column 0 of the TEC-chemical edge file
TEC_ids = pd.read_csv('../resources/edge_data/TEC-chemical.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
TECnonO_data = {'SO': dict.fromkeys(TEC_ids, ['SO_0002139'])}
nonO_data = {**nonO_data, **TECnonO_data['SO']}

***
### Y_RNA sequences

In [None]:
# Unique identifiers from column 0 of the Y_RNA-GO edge file
Y_ids = pd.read_csv('../resources/edge_data/Y_RNA-GO.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
YnonO_data = {'SO': dict.fromkeys(Y_ids, ['SO_0000405'])}
nonO_data = {**nonO_data, **YnonO_data['SO']}

***
### circRNA sequences

In [None]:
# Collect every circRNA identifier used in the edge files and map each one to
# ['SO_0002291'] in the non-ontology subclass map.
# Entries are (edge file, column holding the circRNA identifier), in read order.
circRNA_sources = [
    ('circRNA-TF.txt', 0),
    ('circRNA-GO.txt', 0),
    ('circRNA-disease.txt', 0),
    ('miRNA-circRNA.txt', 1),
    ('circRNA-protein.txt', 0),
    ('circRNA-RBP.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
circRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in circRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
circRNAnonO_data = {'SO': dict.fromkeys(circRNA_ids, ['SO_0002291'])}
nonO_data = {**nonO_data, **circRNAnonO_data['SO']}

***
### eRNA sequences

In [None]:
# Unique identifiers from column 0 of the eRNA-mRNA edge file
eRNA_ids = pd.read_csv('../resources/edge_data/eRNA-mRNA.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
eRNAnonO_data = {'SO': dict.fromkeys(eRNA_ids, ['SO_0000165'])}
nonO_data = {**nonO_data, **eRNAnonO_data['SO']}

***
### Histone modifications

In [None]:
# Collect every histone-modification identifier (column 1 of each edge file)
# and map each one to ['SO_0001700'] in the non-ontology subclass map.
hMod_files = [
    'unknown-histoneModification.txt',
    'mRNA-histoneModification.txt',
    'others-histoneModification.txt',
    'lncRNA-histoneModification.txt',
    'ncRNA-histoneModification.txt',
    'pseudo-histoneModification.txt',
    'premiRNA-histoneModification.txt',
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
hMod_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[1]
     for file_name in hMod_files]).drop_duplicates()

# Every identifier receives the same single-element list as its value
hModnonO_data = {'SO': dict.fromkeys(hMod_ids, ['SO_0001700'])}
nonO_data = {**nonO_data, **hModnonO_data['SO']}

***
### lincRNA sequences

In [None]:
# Unique identifiers from column 0 of the lincRNA-GO edge file
lincRNA_ids = pd.read_csv('../resources/edge_data/lincRNA-GO.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
lincRNAnonO_data = {'SO': dict.fromkeys(lincRNA_ids, ['SO_0001463'])}
nonO_data = {**nonO_data, **lincRNAnonO_data['SO']}

***
### lncRNA sequences

In [None]:
# Collect every lncRNA identifier used in the edge files and map each one to
# ['SO_0001877'] in the non-ontology subclass map.
# Entries are (edge file, column holding the lncRNA identifier), in read order.
lncRNA_sources = [
    ('premiRNA-lncRNA.txt', 1),
    ('miRNA-lncRNA.txt', 1),
    ('othersRNA-lncRNA.txt', 1),
    ('snRNA-lncRNA.txt', 1),
    ('PCG-lncRNA.txt', 1),
    ('tRNA-lncRNA.txt', 1),
    ('piRNA-lncRNA.txt', 1),
    ('tRF-lncRNA.txt', 1),
    ('snoRNA-lncRNA.txt', 1),
    ('lncRNA-mRNA.txt', 0),
    ('lncRNA-lncRNA.txt', 0),
    ('lncRNA-lncRNA.txt', 1),
    ('lncRNA-rRNA.txt', 0),
    ('lncRNA-pseudogene.txt', 0),
    ('lncRNA-protein.txt', 0),
    ('lncRNA-ncRNA.txt', 0),
    ('lncRNA-scaRNA.txt', 0),
    ('lncRNA-TF.txt', 0),
    ('lncRNA-ribozyme.txt', 0),
    ('lncRNA-RBP.txt', 0),
    ('lncRNA-gene.txt', 0),
    ('lncRNA-chemical.txt', 0),
    ('lncRNA-histoneModification.txt', 0),
    ('lncRNA-GO.txt', 0),
    ('lncRNA-disease.txt', 0),
    ('lncRNA-pDeath.txt', 0),
    ('lncRNA-anatomy.txt', 0),
    ('lncRNA-viralmRNA.txt', 0),
    ('lncRNA-viralmiRNA.txt', 0),
    ('lncRNA-viralprotein.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
lncRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in lncRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
lncRNAnonO_data = {'SO': dict.fromkeys(lncRNA_ids, ['SO_0001877'])}
nonO_data = {**nonO_data, **lncRNAnonO_data['SO']}

***
### mtRNA sequences

In [None]:
# Unique identifiers from column 0 of the mtRNA-GO edge file
mtRNA_ids = pd.read_csv('../resources/edge_data/mtRNA-GO.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list (an NCIT identifier
# here, stored under the 'SO' key) and merge into the subclass map
mtRNAnonO_data = {'SO': dict.fromkeys(mtRNA_ids, ['NCIT_C25975'])}
nonO_data = {**nonO_data, **mtRNAnonO_data['SO']}

***
### ncRNA sequences

In [None]:
# Collect every ncRNA identifier used in the edge files and map each one to
# ['SO_0000655'] in the non-ontology subclass map.
# Entries are (edge file, column holding the ncRNA identifier), in read order.
ncRNA_sources = [
    ('miRNA-ncRNA.txt', 1),
    ('lncRNA-ncRNA.txt', 1),
    ('mRNA-ncRNA.txt', 1),
    ('ncRNA-protein.txt', 0),
    ('ncRNA-RBP.txt', 0),
    ('ncRNA-TF.txt', 0),
    ('ncRNA-histoneModification.txt', 0),
    ('ncRNA-GO.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
ncRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in ncRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
ncRNAnonO_data = {'SO': dict.fromkeys(ncRNA_ids, ['SO_0000655'])}
nonO_data = {**nonO_data, **ncRNAnonO_data['SO']}

***
### othersRNA sequences

In [None]:
# Collect every "others" RNA identifier used in the edge files and map each one
# to ['SO_0000356'] in the non-ontology subclass map.
# Entries are (edge file, column holding the identifier), in read order.
othersRNA_sources = [
    ('miRNA-othersRNA.txt', 1),
    ('othersRNA-mRNA.txt', 0),
    ('othersRNA-lncRNA.txt', 0),
    ('othersRNA-pseudogene.txt', 0),
    ('othersRNA-rRNA.txt', 0),
    ('othersRNA-protein.txt', 0),
    ('others-RBP.txt', 0),
    ('others-TF.txt', 0),
    ('others-gene.txt', 0),
    ('others-histoneModification.txt', 0),
    ('others-GO.txt', 0),
    ('other-viralmiRNA.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
othersRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in othersRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
othersRNAnonO_data = {'SO': dict.fromkeys(othersRNA_ids, ['SO_0000356'])}
nonO_data = {**nonO_data, **othersRNAnonO_data['SO']}

***
### piRNA sequences

In [None]:
# Collect every piRNA identifier (column 0 of each edge file) and map each one
# to ['SO_0001035'] in the non-ontology subclass map.
piRNA_files = ['piRNA-mRNA.txt', 'piRNA-lncRNA.txt']
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
piRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[0]
     for file_name in piRNA_files]).drop_duplicates()

# Every identifier receives the same single-element list as its value
piRNAnonO_data = {'SO': dict.fromkeys(piRNA_ids, ['SO_0001035'])}
nonO_data = {**nonO_data, **piRNAnonO_data['SO']}

***
### rRNA sequences

In [None]:
# Collect every rRNA identifier used in the edge files and map each one to
# ['SO_0000252'] in the non-ontology subclass map.
# Entries are (edge file, column holding the rRNA identifier), in read order.
rRNA_sources = [
    ('othersRNA-rRNA.txt', 1),
    ('lncRNA-rRNA.txt', 1),
    ('pseudogene-rRNA.txt', 1),
    ('mRNA-rRNA.txt', 1),
    ('rRNA-rRNA.txt', 0),
    ('rRNA-rRNA.txt', 1),
    ('rRNA-RBP.txt', 0),
    ('rRNA-TF.txt', 0),
    ('rRNA-GO.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
rRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in rRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
rRNAnonO_data = {'SO': dict.fromkeys(rRNA_ids, ['SO_0000252'])}
nonO_data = {**nonO_data, **rRNAnonO_data['SO']}

***
### Ribozyme sequences

In [None]:
# Collect every ribozyme identifier used in the edge files and map each one to
# ['SO_0000374'] in the non-ontology subclass map.
# Entries are (edge file, column holding the ribozyme identifier), in read order.
ribozyme_sources = [
    ('miRNA-ribozyme.txt', 1),
    ('lncRNA-ribozyme.txt', 1),
    ('ribozyme-protein.txt', 0),
    ('ribozyme-RBP.txt', 0),
    ('ribozyme-TF.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
ribozyme_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in ribozyme_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
ribozymenonO_data = {'SO': dict.fromkeys(ribozyme_ids, ['SO_0000374'])}
nonO_data = {**nonO_data, **ribozymenonO_data['SO']}

***
### sRNA sequences

In [None]:
# Unique identifiers from column 0 of the sRNA-TF edge file
sRNA_ids = pd.read_csv('../resources/edge_data/sRNA-TF.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
sRNAnonO_data = {'SO': dict.fromkeys(sRNA_ids, ['SO_0002022'])}
nonO_data = {**nonO_data, **sRNAnonO_data['SO']}

***
### scRNA sequences

In [None]:
# Collect every scRNA identifier used in the edge files and map each one to
# ['SO_0000013'] in the non-ontology subclass map.
# Entries are (edge file, column holding the scRNA identifier), in read order.
scRNA_sources = [
    ('miRNA-scRNA.txt', 1),
    ('scRNA-mRNA.txt', 0),
    ('scRNA-protein.txt', 0),
    ('scRNA-RBP.txt', 0),
    ('scRNA-GO.txt', 0),
    ('scRNA-disease.txt', 0),
    ('scRNA-viralmiRNA.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
scRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in scRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
scRNAnonO_data = {'SO': dict.fromkeys(scRNA_ids, ['SO_0000013'])}
nonO_data = {**nonO_data, **scRNAnonO_data['SO']}

***
### scaRNA sequences

In [None]:
# Collect every scaRNA identifier used in the edge files and map each one to
# ['SO_0002095'] in the non-ontology subclass map.
# Entries are (edge file, column holding the scaRNA identifier), in read order.
scaRNA_sources = [
    ('miRNA-scaRNA.txt', 1),
    ('lncRNA-scaRNA.txt', 1),
    ('mRNA-scaRNA.txt', 1),
    ('scaRNA-RBP.txt', 0),
    ('scaRNA-TF.txt', 0),
    ('scaRNA-GO.txt', 0),
    ('scaRNA-epiMod.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
scaRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in scaRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
scaRNAnonO_data = {'SO': dict.fromkeys(scaRNA_ids, ['SO_0002095'])}
nonO_data = {**nonO_data, **scaRNAnonO_data['SO']}

***
### snRNA sequences

In [None]:
# Collect every snRNA identifier used in the edge files and map each one to
# ['SO_0000274'] in the non-ontology subclass map.
# Entries are (edge file, column holding the snRNA identifier), in read order.
snRNA_sources = [
    ('miRNA-snRNA.txt', 1),
    ('snRNA-snRNA.txt', 0),
    ('snRNA-snRNA.txt', 1),
    ('snRNA-lncRNA.txt', 0),
    ('snRNA-mRNA.txt', 0),
    ('snRNA-pseudogene.txt', 0),
    ('snRNA-snoRNA.txt', 0),
    ('snRNA-protein.txt', 0),
    ('snRNA-RBP.txt', 0),
    ('snRNA-TF.txt', 0),
    ('snRNA-GO.txt', 0),
    ('snRNA-viralmRNA.txt', 0),
    ('snRNA-viralmiRNA.txt', 0),
    ('snRNA-disease.txt', 0),
    ('snRNA-ev.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
snRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in snRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
snRNAnonO_data = {'SO': dict.fromkeys(snRNA_ids, ['SO_0000274'])}
nonO_data = {**nonO_data, **snRNAnonO_data['SO']}

***
### snoRNA sequences

In [None]:
# Collect every snoRNA identifier used in the edge files and map each one to
# ['SO_0000275'] in the non-ontology subclass map.
# Entries are (edge file, column holding the snoRNA identifier), in read order.
snoRNA_sources = [
    ('miRNA-snoRNA.txt', 1),
    ('snRNA-snoRNA.txt', 1),
    ('snoRNA-lncRNA.txt', 0),
    ('snoRNA-mRNA.txt', 0),
    ('snoRNA-pseudogene.txt', 0),
    ('snoRNA-protein.txt', 0),
    ('snoRNA-RBP.txt', 0),
    ('snoRNA-TF.txt', 0),
    ('snoRNA-chemical.txt', 0),
    ('snoRNA-GO.txt', 0),
    ('snoRNA-pDeath.txt', 0),
    ('snoRNA-viralmiRNA.txt', 0),
    ('snoRNA-disease.txt', 0),
    ('snoRNA-epiMod.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
snoRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in snoRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
snoRNAnonO_data = {'SO': dict.fromkeys(snoRNA_ids, ['SO_0000275'])}
nonO_data = {**nonO_data, **snoRNAnonO_data['SO']}

***
### tRF sequences

In [None]:
# Collect every tRF identifier (column 0 of each edge file) and map each one
# to ['SO_0001172'] in the non-ontology subclass map.
tRF_files = ['tRF-mRNA.txt', 'tRF-lncRNA.txt', 'tRF-pseudogene.txt']
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
tRF_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[0]
     for file_name in tRF_files]).drop_duplicates()

# Every identifier receives the same single-element list as its value
tRFnonO_data = {'SO': dict.fromkeys(tRF_ids, ['SO_0001172'])}
nonO_data = {**nonO_data, **tRFnonO_data['SO']}

***
### tRNA sequences

In [None]:
# Collect every tRNA identifier (column 0 of each edge file) and map each one
# to ['SO_0000253'] in the non-ontology subclass map.
tRNA_files = [
    'tRNA-mRNA_NCBI.txt',
    'tRNA-mRNA_gtRNA.txt',
    'tRNA-lncRNA.txt',
    'tRNA-TF.txt',
    'tRNA-GO.txt',
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
tRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[0]
     for file_name in tRNA_files]).drop_duplicates()

# Every identifier receives the same single-element list as its value
tRNAnonO_data = {'SO': dict.fromkeys(tRNA_ids, ['SO_0000253'])}
nonO_data = {**nonO_data, **tRNAnonO_data['SO']}

***
### unknownRNA sequences

In [None]:
# Collect every "unknown" RNA identifier used in the edge files and map each
# one to ['SO_0000356'] in the non-ontology subclass map.
# Entries are (edge file, column holding the identifier), in read order.
unknownRNA_sources = [
    ('miRNA-unknownRNA.txt', 1),
    ('unknownRNA-protein.txt', 0),
    ('unknown-TF.txt', 0),
    ('unknown-histoneModification.txt', 0),
    ('unknown-viralmiRNA.txt', 0),
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
unknownRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[column]
     for file_name, column in unknownRNA_sources]).drop_duplicates()

# Every identifier receives the same single-element list as its value
unknownRNAnonO_data = {'SO': dict.fromkeys(unknownRNA_ids, ['SO_0000356'])}
nonO_data = {**nonO_data, **unknownRNAnonO_data['SO']}

***
### vRNA sequences

In [None]:
# Unique identifiers from column 0 of the vRNA-GO edge file
vRNA_ids = pd.read_csv('../resources/edge_data/vRNA-GO.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
vRNAnonO_data = {'SO': dict.fromkeys(vRNA_ids, ['SO_0001041'])}
nonO_data = {**nonO_data, **vRNAnonO_data['SO']}

***
### viralmRNA sequences

In [None]:
# Collect every viral mRNA identifier (column 1 of each edge file) and map each
# one to ['SO_0001041'] in the non-ontology subclass map.
viralmRNA_files = [
    'miRNA-viralmRNA.txt',
    'snRNA-viralmRNA.txt',
    'lncRNA-viralmRNA.txt',
    'pseudo-viralmRNA.txt',
]
# Series.append was removed in pandas 2.0, so the columns are stacked with pd.concat
viralmRNA_ids = pd.concat(
    [pd.read_csv('../resources/edge_data/' + file_name, sep='\t', header=None)[1]
     for file_name in viralmRNA_files]).drop_duplicates()

# Every identifier receives the same single-element list as its value
viralmRNAnonO_data = {'SO': dict.fromkeys(viralmRNA_ids, ['SO_0001041'])}
nonO_data = {**nonO_data, **viralmRNAnonO_data['SO']}

***
### viralnsRNA sequences

In [None]:
# Unique identifiers from column 1 of the mRNA-viralnsRNA edge file
viralnsRNA_ids = pd.read_csv('../resources/edge_data/mRNA-viralnsRNA.txt', sep='\t', header=None)[1].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
viralnsRNAnonO_data = {'SO': dict.fromkeys(viralnsRNA_ids, ['SO_0001041'])}
nonO_data = {**nonO_data, **viralnsRNAnonO_data['SO']}

***
### vtRNAs sequences

In [None]:
# Unique identifiers from column 0 of the vtRNAs-protein edge file
vtRNAs_ids = pd.read_csv('../resources/edge_data/vtRNAs-protein.txt', sep='\t', header=None)[0].drop_duplicates()

# Map each identifier to the same single-element class list and merge into the subclass map
vtRNAsnonO_data = {'SO': dict.fromkeys(vtRNAs_ids, ['SO_0000404'])}
nonO_data = {**nonO_data, **vtRNAsnonO_data['SO']}

In [None]:
# Overwrite the subclass construction map with the extended mapping
subclass_map_path = '../resources/construction_approach/' + 'subclass_construction_map.pkl'
with open(subclass_map_path, 'wb') as pkl_out:
    pickle.dump(nonO_data, pkl_out, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Sanity check only (DO NOT RUN as part of the build): reload the written map and inspect it
subclass_map_file = '../resources/construction_approach/' + 'subclass_construction_map.pkl'
nonO_data = pd.read_pickle(subclass_map_file)

nonO_data.items()