# <p style="text-align: center;">RNA Knowledge Graph Build Data Preparation</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@studenti.unimi.it), [TJCallahan](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=callahantiff@gmail.com), [MMesiti](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=marco.mesiti@unimi.it), [GValentini](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=giorgio.valentini@unimi.it)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/), [PheKnowLator](https://github.com/callahantiff/PheKnowLator/)  
<!--- **Release:** **[v2.0.0](https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0)** --->
  
<br>  
  
**Purpose:** This notebook serves as a script to download, process, map, and clean data in order to build edges for RNA-KG. For more information on the data sources utilized within this script, please see the [PheKnowLator Data Sources](https://github.com/callahantiff/PheKnowLator/wiki/v2-Data-Sources) Wiki page.

<br>

**Assumptions:**   
- Edge data downloads ➞ `./resources/edge_data`  
- Ontologies ➞ `./resources/ontologies`    
- Processed data write location ➞ `./resources/processed_data`  

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts. 
_____
***

## Table of Contents
***

### [Download and process Ontologies](#create-ontologies)

### [Download and create Identifier Maps ](#create-identifier-maps)   

### [Download and process Edge Datasets](#create-edges)  

____

## Set-Up Environment
_____

In [None]:
%%capture
import sys
!{sys.executable} -m pip install -r requirements.txt
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

#### Define Global Variables

In [None]:
# Root folder that holds every resource used by this notebook
resource_data_location = '../resources/'

# Processed data folder, and its sub-folder for unprocessed data
processed_data_location = resource_data_location + 'processed_data/'
unprocessed_data_location = processed_data_location + 'unprocessed_data/'

# Folders that ontology data and edge data are written to
ontology_data_location = resource_data_location + 'ontologies/'
edge_data_location = resource_data_location + 'edge_data/'

# URLs of the processed and of the original data
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'
original_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/original_data/'

# Location of owltools (used as the command when ontologies are downloaded)
owltools_location = '../pkt_kg/libs/owltools'

In [None]:
# Fetch the relation files into the relations data folder
relations_dir = '../resources/relations_data/'
for relations_file in ('INVERSE_RELATIONS.txt', 'RELATIONS_LABELS.txt'):
    data_downloader(processed_url + relations_file, relations_dir)

# Read the relation labels, report how many rows were loaded, and preview the first ones
ro_data_label = pd.read_csv(relations_dir + 'RELATIONS_LABELS.txt', header=0, delimiter='\t')
print(f'There are {len(ro_data_label)} RO Relations and Labels')
ro_data_label.head(n=5)

***
***
## DOWNLOAD AND PROCESS ONTOLOGIES  <a class="anchor" id="create-ontologies"></a>
***
***

In [None]:
def download_ontology(ontology):
    """Download an ontology merged with its import closure via owltools.

    The result is written to `<ontology>_with_imports.owl` inside
    `ontology_data_location`; nothing happens when that file already exists.

    Args:
        ontology: ontology name used to build the OBO PURL and the output file name.
    """
    merged_file = ontology_data_location + ontology + '_with_imports.owl'
    if os.path.exists(merged_file):
        return

    ontology_purl = 'http://purl.obolibrary.org/obo/' + ontology + '.owl'
    os.system('{} {} --merge-import-closure -o {}'.format(owltools_location, ontology_purl, merged_file))


# Ontologies to download
onto_list = ['ro', 'chebi', 'pr', 'mondo', 'go', 'pw']

for ontology in onto_list:
    download_ontology(ontology)

***
***
## DOWNLOAD AND CREATE MAPPING DATASETS  <a class="anchor" id="create-identifier-maps"></a>
***
***

### Mappings provided by PheKnowLator ecosystem

In [None]:
map_list = ['DISEASE_MONDO_MAP.txt', 'ENSEMBL_GENE_ENTREZ_GENE_MAP.txt',
            'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt',
            'ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt', 'GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt',
            'MESH_CHEBI_MAP.txt', 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', 'STRING_PRO_ONTOLOGY_MAP.txt',
            'UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt']

for map_txt in map_list:
    data_downloader(processed_url+map_txt, processed_data_location)

***
### New mappings

***
### Chemical description from ChEBI - ChEBI mapping


**Purpose:** To map Chemical description from ChEBI to ChEBI identifiers.

**Output:** None, this mapping will be used only internally.

In [None]:
# Get the labels (and exact matches) of all ontology classes
def gets_ontology_class_label(graph: Graph) -> Tuple:
    """Index the classes of a graph by their lower-cased label and exact-match values.

    Every triple whose subject is a URIRef is inspected once. Triples whose predicate
    contains 'label' (case-insensitive) are collected as labels; triples whose predicate
    contains 'exactmatch' are collected as exact matches.

    Args:
        graph: the graph whose triples are scanned.

    Returns:
        A tuple of two dictionaries, both keyed by the lower-cased string of the triple's
        object. The first maps each key to the list of subject URIs (as strings) it was
        found on. The second maps each key to 'DbXref' (label triples) or 'ExactMatch'
        (exact-match triples). When a key is present in both groups, the exact-match
        entry wins in both dictionaries.
    """
    label_uris: Dict = dict()
    exact_uris: Dict = dict()

    for subj, pred, obj in graph:
        # The subject of the current triple must be a URIRef for BOTH groups. Testing a
        # constant such as [0] instead of the subject would reject every exact match.
        if not isinstance(subj, URIRef):
            continue
        predicate, key = str(pred).lower(), str(obj).lower()
        if 'label' in predicate:
            label_uris.setdefault(key, []).append(str(subj))
        if 'exactmatch' in predicate:
            exact_uris.setdefault(key, []).append(str(subj))

    # The 'DbXref' tag for labels is kept as-is for backward compatibility
    label_type = {key: 'DbXref' for key in label_uris}
    exact_type = {key: 'ExactMatch' for key in exact_uris}

    return {**label_uris, **exact_uris}, {**label_type, **exact_type}

In [None]:
# Parse ChEBI and index its classes by label
chebi_graph = Graph().parse(ontology_data_location + 'chebi_with_imports.owl')
chebi_label = gets_ontology_class_label(chebi_graph)[0]

# Keep only the last URI segment of every class
chebi_dict = {}
for label, uris in chebi_label.items():
    chebi_dict[str(label)] = {str(uri).split('/')[-1] for uri in uris}

# Write one "label <TAB> identifiers" row per label; the set's braces and quotes are stripped
desc_chebi_path = unprocessed_data_location + 'DESC_CHEBI_MAP.txt'
with open(desc_chebi_path, 'w') as outfile:
    for label, identifiers in chebi_dict.items():
        cleaned = str(identifiers).replace('{', '').replace('\'', '').replace('}', '')
        outfile.write(str(label) + '\t' + cleaned + '\n')

desc_chebi_map = pd.read_csv(desc_chebi_path, header=None, delimiter='\t')
desc_chebi_map

***
### miRNA - miRBase mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map miRNA and stem-loop miRNA to miRBase identifiers.

**Output:** `MIRNA_MIRBASE_MAP.txt`

Provided by [miRBase](https://www.mirbase.org/).

In [None]:
data_downloader('https://www.mirbase.org/ftp/CURRENT/aliases.txt.zip', unprocessed_data_location)
 
mirna_mirbase_map = pd.read_csv(unprocessed_data_location + 'aliases.txt', sep="\t", header=None)
mirna_mirbase_map[1] = mirna_mirbase_map[1].str[:-1]
mirna_mirbase_map

In [None]:
mirna_mirbase_map[1] = mirna_mirbase_map[1].str.split(';')
mirna_mirbase_map = mirna_mirbase_map.explode(1)
mirna_mirbase_map[[1,0]]

In [None]:
mirna_mirbase_map[[1,0]].to_csv(processed_data_location + 'MIRNA_MIRBASE_MAP.txt', header=None, sep='\t', index=None)

***
### Disease Ontology (DO) - MONDO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map DO identifiers to MONDO identifiers.

**Output:** `DOID_MONDO_MAP.txt`

In [None]:
mondo_graph = Graph().parse(ontology_data_location + 'mondo_with_imports.owl')
mondo_dbxref = gets_ontology_class_dbxrefs(mondo_graph)[0]

# Keep only the DOID cross-references. ':' is substituted with '_' both in the
# (upper-cased) DOID key and in the identifiers taken from the last URI segment.
mondo_dict = {}
for xref, uris in mondo_dbxref.items():
    if 'doid' not in str(xref):
        continue
    doid = str(xref).replace(':', '_').upper()
    mondo_dict[doid] = {str(uri).split('/')[-1].replace(':', '_') for uri in uris}

list(mondo_dict.items())[:5]

In [None]:
with open(processed_data_location + 'DOID_MONDO_MAP.txt', 'w') as outfile:
    for k, v in mondo_dict.items():
        outfile.write(str(k) + '\t' + str(v).replace('{','').replace('\'','').replace('}','') + '\n')

***
### Disease description from DO - DO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map Disease descriptions from DO to DO identifiers.

**Output:** None, this mapping will be used only internally.

Provided by [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/).

In [None]:
data_downloader('http://watson.compbio.iupui.edu:8080/miR2Disease/download/diseaseList.txt', unprocessed_data_location)
 
desc_do_map = pd.read_csv(unprocessed_data_location + 'diseaseList.txt', sep="\t")
desc_do_map.columns = ['desc', 'doid']
desc_do_map['desc'] = desc_do_map['desc'].str.lower()
desc_do_map['doid'] = desc_do_map['doid'].str.upper().str.replace(':', '_')
desc_do_map

***
### TCGA - MONDO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To manually map the 32 TCGA cancer types to MONDO ontology.

**Output:** `TCGA_MONDO_MAP.txt`

In [None]:
cancer_mondo_map = pd.DataFrame(data=[['ACC','MONDO_0004971'],
                                 ['BLCA','MONDO_0004163'],
                                 ['BRCA','MONDO_0006256'],
                                 ['CESC','MONDO_0005131'],
                                 ['CHOL','MONDO_0019087'],
                                 ['COAD','MONDO_0002271'],
                                 ['DLBC','MONDO_0018905'],
                                 ['ESCA','MONDO_0019086'],
                                 ['GBM','MONDO_0018177'],
                                 ['HNSC','MONDO_0010150'],
                                 ['KICH','MONDO_0017885'],
                                 ['KIRC','MONDO_0005005'],
                                 ['KIRP','MONDO_0017884'],
                                 ['LGG','MONDO_0005499'],
                                 ['LIHC','MONDO_0007256'],
                                 ['LUAD','MONDO_0005061'],
                                 ['LUSC','MONDO_0005097'],
                                 ['MESO','MONDO_0005065'],
                                 ['OV','MONDO_0006046'],
                                 ['PAAD','MONDO_0006047'],
                                 ['PCPG','MONDO_0035540'],
                                 ['PRAD','MONDO_0005082'],
                                 ['READ','MONDO_0002169'],
                                 ['SARC','MONDO_0005089'],
                                 ['SKCM','MONDO_0005012'],
                                 ['STAD','MONDO_0005036'],
                                 ['TGCT','MONDO_0010108'],
                                 ['THCA','MONDO_0015075'],
                                 ['THYM','MONDO_0006456'],
                                 ['UCEC','MONDO_0000553'],
                                 ['UCS','MONDO_0006485'],
                                 ['UVM','MONDO_0006486']
                                 ])

cancer_mondo_map.to_csv(processed_data_location + 'TCGA_MONDO_MAP.txt', header=None, sep='\t', index=None)

***
### Gene symbol - PRO mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map gene symbols to PRO identifiers.

**Output:** `GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt`

In [None]:
symbol_ensembl_map = pd.read_csv(processed_data_location + 'GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt', sep="\t", header=None)
symbol_ensembl_map[[0,1]]

In [None]:
ensembl_pro_map = pd.read_csv(processed_data_location + 'ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt', sep="\t", header=None)
ensembl_pro_map[[1,0]]

In [None]:
symbol_to_pro = pd.merge(symbol_ensembl_map[[0,1]], ensembl_pro_map[[1,0]], left_on=[1], right_on=[0])
symbol_to_pro[['0_x', '1_y']].drop_duplicates()

In [None]:
symbol_to_pro[['0_x', '1_y']].drop_duplicates().to_csv(processed_data_location+
                                                       'GENE_SYMBOL_PRO_ONTOLOGY_MAP.txt', header=None,
                                                       sep='\t', index=None)

***
### PRO label - PRO mapping


**Purpose:** To map PRO labels to PRO identifiers.

**Output:** None, this mapping will be used only internally.

In [None]:
# Parse PRO and index its classes by label
pro_graph = Graph().parse(ontology_data_location + 'pr_with_imports.owl')

pro_label = gets_ontology_class_label(pro_graph)[0]
# Keep only the last URI segment of every class
pro_dict = {str(k): {str(i).split('/')[-1] for i in v} for k, v in pro_label.items()}

# Write one "label <TAB> identifiers" row per label; the set's braces and quotes are stripped
with open(unprocessed_data_location + 'DESC_PRO_MAP.txt', 'w') as outfile:
    for k, v in pro_dict.items():
        outfile.write(str(k) + '\t' + str(v).replace('{','').replace('\'','').replace('}','') + '\n')

desc_pro_map = pd.read_csv(unprocessed_data_location+'DESC_PRO_MAP.txt', header=None, delimiter='\t', dtype=object)

# Clean the labels: remove the word "human" and the parentheses (literal matches), then
# cut everything from the first comma onwards (regular expression). `regex` is passed
# explicitly because its default value differs between pandas versions; without it the
# comma pattern is treated as a literal string by recent pandas and never matches.
desc_pro_map[0] = desc_pro_map[0].str.replace("human", '', regex=False)
desc_pro_map[0] = desc_pro_map[0].str.replace("(", '', regex=False)
desc_pro_map[0] = desc_pro_map[0].str.replace(")", '', regex=False)
desc_pro_map[0] = desc_pro_map[0].str.replace(r",(.*)", '', regex=True)
desc_pro_map

***
### Gene symbol - ENTREZ mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map gene symbols to ENTREZ identifiers.

**Output:** `GENE_SYMBOL_ENTREZ_ID_MAP.txt`

In [None]:
entrez_enst_map = pd.read_csv(processed_data_location + 'ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt', sep="\t", header=None)
entrez_enst_map

In [None]:
symbol_entrez_map = pd.merge(symbol_ensembl_map, entrez_enst_map, on=[1])
symbol_entrez_map[['0_x','0_y']].drop_duplicates()

In [None]:
symbol_entrez_map[['0_x','0_y']].drop_duplicates().to_csv(processed_data_location+
                                                      'GENE_SYMBOL_ENTREZ_ID_MAP.txt',
                                                      header=None, sep='\t', index=None)

***
### Stem-loop miRNA - ENTREZ mapping <a class="anchor" id="ensemblgene-entrezgene"></a>


**Purpose:** To map miRNA to ENTREZ identifiers.

**Output:** `MIRNA_SYMBOL_ENTREZ_MAP.txt`

In [None]:
miRNA_symbol_ensembl_map = symbol_ensembl_map[symbol_ensembl_map[3]=='miRNA'][[0,1]]
miRNA_symbol_ensembl_map[2] = 'hsa-mir'+miRNA_symbol_ensembl_map[0].str.replace(r'MIR', '-').str.lower()
miRNA_symbol_entrez_map = pd.merge(miRNA_symbol_ensembl_map[[2,0]], symbol_entrez_map, left_on=[0],
                                   right_on=['0_x'])[[2,'0_y']]
miRNA_symbol_entrez_map

In [None]:
miRNA_symbol_entrez_map.to_csv(processed_data_location+'MIRNA_SYMBOL_ENTREZ_MAP.txt',
                               header=None, sep='\t', index=None)

***
### tsRNA - tRNA mapping 

**Purpose:** To map tsRNA to tRNA identifiers.

**Output:** `tRNA_tsRNA_MAP.txt`

Provided by [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php).

In [None]:
data_downloader('https://rna.sysu.edu.cn/tsRFun/download/newID_20210202.txt', unprocessed_data_location)

tsRNA_tRF_map = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")
tsRNA_tRF_map 

In [None]:
tsRNA_tRF_map = tsRNA_tRF_map[['tRNA','tsRNAid']]
tsRNA_tRF_map

In [None]:
tsRNA_tRF_map.to_csv(processed_data_location + 'tRNA_tsRNA_MAP.txt', header=None, sep='\t', index=None)

***
### ribozyme - RFAM mapping 

**Purpose:** To map ribozyme to RFAM identifiers.

**Output:** `ribozyme_RFAM_MAP.txt`

In [None]:
ribozyme_rfam_map = pd.DataFrame(data=[['LC ribozyme','family/RF00011'],
                                 ['hammerhead ribozyme','clan/CL00010'],
                                 ['glmS ribozyme','family/RF00234'],
                                 ['HDV-F-prausnitzii','family/RF02682'],
                                 ['HDV ribozyme','family/RF00094'],
                                 ['HDV_ribozyme','family/RF00094'],
                                 ['Hairpin','family/RF00173'],
                                 ['Hammerhead_1','clan/CL00010'],
                                 ['Hammerhead_HH9','clan/CL00010'],
                                 ['Hammerhead_3','clan/CL00010'],
                                 ['Hammerhead_HH10','clan/CL00010'],
                                 ['Hammerhead_II','clan/CL00010'],
                                 ['Pistol','family/RF02679'],
                                 ['Pistol ribozyme','family/RF02679'],
                                 ['twister ribozyme','clan/CL00120'],
                                 ['Twister-P5','clan/CL00120'],
                                 ['Twister-P3','clan/CL00120'],
                                 ['RNAse P','family/RF00009']#,
                                 #['VS ribozyme',''] absent in RFAM
                                 ])

ribozyme_rfam_map.to_csv(processed_data_location + 'ribozyme_RFAM_MAP.txt', header=None, sep='\t', index=None)

***
***
## DOWNLOAD AND PROCESS EDGE DATASETS  <a class="anchor" id="create-edges"></a>
***
***

## Edges already provided by PheKnowLator ecosystem

In [None]:
# Edge files provided by the PheKnowLator ecosystem. Every file is listed exactly once:
# a file that feeds more than one edge type only needs to be downloaded a single time.
pkt_edge_files = ['curated_gene_disease_associations.tsv', 'CTD_chemicals_diseases.tsv', 'CTD_chem_gene_ixns.tsv',
                  'COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt', '9606.protein.links.v11.0.txt',
                  'ChEBI2Reactome_All_Levels.txt', 'CTD_chem_go_enriched.tsv', 'CTD_genes_pathways.tsv',
                  'UniProt2Reactome_All_Levels.txt', 'goa_human.gaf', 'gene_association.reactome']

for edge in pkt_edge_files:
    data_downloader(original_url+edge, edge_data_location)

In [None]:
# Rename each downloaded file after the relationship(s) it provides.
# A source file that feeds several edge types is copied for all but its last target and
# then renamed to the last one: renaming the same source twice fails with
# FileNotFoundError, because the first rename already moved the file away.
edge_file_targets = {
    'curated_gene_disease_associations.tsv': ['gene-disease.tsv'],
    'CTD_chemicals_diseases.tsv': ['chemical-disease.tsv'],
    'CTD_chem_gene_ixns.tsv': ['chemical-gene.tsv'],
    'COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt': ['gene-gene.txt'],
    '9606.protein.links.v11.0.txt': ['protein-protein.txt'],
    'ChEBI2Reactome_All_Levels.txt': ['chemical-pathway.txt'],
    'CTD_chem_go_enriched.tsv': ['chemical-gobp.tsv', 'chemical-gocc.tsv'],
    'CTD_genes_pathways.tsv': ['gene-pathway.tsv'],
    'UniProt2Reactome_All_Levels.txt': ['protein-pathway.tsv'],
    'goa_human.gaf': ['protein-gobp.gaf', 'protein-gocc.gaf'],
    'gene_association.reactome': ['pathway-gocc.reactome'],
}

for source_name, target_names in edge_file_targets.items():
    source_path = edge_data_location + source_name
    *copy_targets, rename_target = target_names
    for target_name in copy_targets:
        shutil.copyfile(source_path, edge_data_location + target_name)
    os.rename(source_path, edge_data_location + rename_target)

***
## New edges

***
### gene-miRNA

* [TarBase](https://dianalab.e-ce.uth.gr/html/diana/web/index.php?r=tarbasev8/index) <br />  DIANA-TarBase v8 is a reference database devoted to the indexing of experimentally supported microRNA (miRNA) targets.

In [None]:
data_downloader('https://dianalab.e-ce.uth.gr/downloads/tarbase_v8_data.tar.gz', unprocessed_data_location)

with tarfile.TarFile(unprocessed_data_location+'tarbase_v8_data.tar', 'r') as tar_ref:
    tar_ref.extractall(unprocessed_data_location)
    
gene_miRNA = pd.read_csv(unprocessed_data_location + 'TarBase_v8_download.txt', sep="\t",
                         dtype={"cell_line": "string"})  

# For the time being, we keep only Homo sapiens rows
gene_miRNA = gene_miRNA[gene_miRNA['species'].str.contains("Homo sapiens")]

gene_miRNA

In [None]:
# Strip the "(hsa)" tag from gene identifiers and gene names. A literal (non-regex)
# replacement matches the same text as the escaped pattern, avoids the invalid "\("
# escape sequences, and does not depend on the pandas-version-specific `regex` default.
# `assign` builds a new frame instead of writing into a filtered slice.
gene_miRNA = gene_miRNA.assign(
    geneId=gene_miRNA['geneId'].str.replace("(hsa)", '', regex=False),
    geneName=gene_miRNA['geneName'].str.replace("(hsa)", '', regex=False),
)

gene_miRNA.to_csv(edge_data_location + 'gene-miRNA.txt', header=None, sep='\t', index=None)

***
### miRNA-disease

* [miR2Disease](http://watson.compbio.iupui.edu:8080/miR2Disease/) <br />miR2Disease is a manually curated database that aims at providing a comprehensive resource of miRNA deregulation in various human diseases.

In [None]:
data_downloader('http://watson.compbio.iupui.edu:8080/miR2Disease/download/AllEntries.txt', unprocessed_data_location)

miRNA_disease = pd.read_csv(unprocessed_data_location + 'AllEntries.txt', sep="\t", header=None)  
miRNA_disease

In [None]:
miRNA_disease[1] = miRNA_disease[1].str.lower()

miRNA_disease.columns = ['mirna', 'desc', 2,3,4,5]

miRNA_disease = pd.merge(desc_do_map, miRNA_disease, on=['desc'])
miRNA_disease.to_csv(edge_data_location + 'miRNA-disease.txt', header=None, sep='\t', index=None)

***
### tsRNA-miRNA

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php) <br /> tsRFun is a platform for tsRNA functions by High-throughput Small RNA-Seq and CLIP-Seq Data.

In [None]:
data_downloader('https://rna.sysu.edu.cn/tsRFun/download/tsRNetwork/all_hypgm_df.txt', unprocessed_data_location)

tsRNA_miRNA = pd.read_csv(unprocessed_data_location + 'all_hypgm_df.txt', sep="\t")  
tsRNA_miRNA

In [None]:
# We consider pairs with FDR < 0.01 and remove the unadjusted p-val column since we have FDR.
# The filter and the drop are chained into a new frame: an in-place drop on a filtered
# slice raises pandas' SettingWithCopyWarning.
tsRNA_miRNA = tsRNA_miRNA[tsRNA_miRNA['adj.p'] < 0.01].drop(columns='p')

tsRNA_miRNA.to_csv(edge_data_location + 'tsRNA-miRNA.txt', header=None, sep='\t', index=None)

***
### tsRNA-disease

* [tsRFun](https://rna.sysu.edu.cn/tsRFun/index.php)

In [None]:
data_downloader('https://rna.sysu.edu.cn/tsRFun/download/tsRinCancer/allCancer_0.txt', unprocessed_data_location)

tsRNA_disease = pd.read_csv(unprocessed_data_location + 'allCancer_0.txt', sep="\t", index_col=0)  
tsRNA_disease

In [None]:
# We keep only log2FC columns. The selection is copied so that the masked assignment
# below writes into an independent frame rather than into a slice.
is_log2fc_column = tsRNA_disease.columns.str.endswith('_log2FC')
tsRNA_disease = tsRNA_disease.loc[:, is_log2fc_column].copy()

# Drop the '_log2FC' suffix from the column names. The pattern is anchored with '$',
# so it must be applied as a regular expression; `regex=True` is explicit because the
# default of `regex` differs between pandas versions.
tsRNA_disease.columns = tsRNA_disease.columns.str.replace(r'_log2FC$', '', regex=True)

# tsRNA has a relationship with cancer iff |log2FC| >= 1
tsRNA_disease[abs(tsRNA_disease) < 1] = 0
tsRNA_disease

In [None]:
# We want a dataframe with 2 columns, tRF and associated cancer;
# this is an example with ACC 
tRF=[]
log2FC=[]
for index, row in tsRNA_disease.iterrows():
     if row['ACC'] != 0:
            tRF.append(index)
            log2FC.append(row['ACC'])
            
df_acc = pd.DataFrame (tRF, columns = ['tRF'])
df_acc['dis'] = 'ACC'
df_acc['log2FC'] = log2FC
df_acc

In [None]:
# Build the (tRF, cancer, log2FC) table from the wide matrix: one row per cell whose
# value is different from 0, cancers in column order and tRFs in row order within each
# cancer. The per-cancer frames are combined with a single pd.concat, because
# DataFrame.append is deprecated and no longer exists in pandas 2.0, and the column-wise
# selection replaces a row-by-row iterrows loop repeated for every cancer.
cancer_frames = []
for cancer in tsRNA_disease.columns:
    cancer_log2fc = tsRNA_disease[cancer]
    associated = cancer_log2fc[cancer_log2fc != 0]
    if associated.empty:
        continue
    cancer_frames.append(pd.DataFrame({'tRF': associated.index,
                                       'dis': cancer,
                                       'log2FC': associated.values}))

if cancer_frames:
    trRF_disease = pd.concat(cancer_frames, ignore_index=True)
else:
    # No association at all: keep an empty table with the expected columns
    trRF_disease = pd.DataFrame(columns=['tRF', 'dis', 'log2FC'])

trRF_disease

In [None]:
trRF_disease.to_csv(edge_data_location + 'tsRNA-disease.txt', header=None, sep='\t', index=None)

***
### snoRNA-gene

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/) <br /> snoDB is an interactive database of human small nucleolar RNAs (snoRNAs) that includes up-to-date information on snoRNA features, genomic location, conservation, host gene, snoRNA-RNA targets and snoRNA abundance and provides links to other resources.

In [None]:
data_downloader('https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/download_all', unprocessed_data_location)

snoDB = pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")
snoDB

In [None]:
snoDB = snoDB[snoDB['ncbi_id'].notna()]
snoDB['ncbi_id'] = pd.to_numeric(snoDB['ncbi_id'], downcast='integer') 

snoRNA_host = snoDB[['snodb_id', 'ensembl_id', 'refseq_id', 'hgnc_id', 'ncbi_id', 'host_gene_id', 'gene_name', 'host_gene_name']]
snoRNA_host = snoRNA_host[snoRNA_host['host_gene_id'].notna()]
snoRNA_host

In [None]:
snoRNA_host.to_csv(edge_data_location + 'snoRNA-gene.txt', header=None, sep='\t', index=None)

***
### snoRNA-miRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
snoRNA_miRNA = snoDB[['snodb_id', 'ensembl_id', 'refseq_id', 'hgnc_id', 'ncbi_id', 'host_gene_id', 'gene_name', 'mirna_targets']]
snoRNA_miRNA = snoRNA_miRNA[snoRNA_miRNA['mirna_targets'].notna()]
snoRNA_miRNA

In [None]:
# Manually assigned miRBase identifiers, one list element per row, in row order.
# NOTE(review): the hard-coded two-element list only fits a frame with exactly two rows
# (pandas rejects a length mismatch) — revisit whenever the snoDB download changes.
snoRNA_miRNA['miRBase_id'] = ['', 'hsa-mir-19b-2']
# Up to now, no miRBase ID is associated with AC008521

snoRNA_miRNA.to_csv(edge_data_location + 'snoRNA-miRNA.txt', header=None, sep='\t', index=None)

***
### snoRNA-snoRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
snoRNA_snoRNA = snoDB[['snodb_id', 'ensembl_id', 'refseq_id', 'hgnc_id', 'ncbi_id', 'host_gene_id', 'gene_name', 'snorna_targets']]
snoRNA_snoRNA = snoRNA_snoRNA[snoRNA_snoRNA['snorna_targets'].notna()]
snoRNA_snoRNA

In [None]:
snoRNA_snoRNA['processed_targets'] = snoRNA_snoRNA.snorna_targets.str.split(';')
snoRNA_snoRNA = snoRNA_snoRNA.explode('processed_targets')
snoRNA_snoRNA.drop('snorna_targets', axis=1, inplace=True)
snoRNA_snoRNA

In [None]:
snoRNA_snoRNA.to_csv(edge_data_location + 'snoRNA-snoRNA.txt', header=None, sep='\t', index=None)

***
### snoRNA-lncRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNA descriptors plus the semicolon-separated lncRNA targets; rows without targets are skipped
lncrna_target_columns = ['snodb_id', 'ensembl_id', 'refseq_id', 'hgnc_id', 'ncbi_id', 'host_gene_id',
                         'gene_name', 'lncrna_targets']
has_lncrna_targets = snoDB['lncrna_targets'].notna()
snoRNA_lncRNA = snoDB.loc[has_lncrna_targets, lncrna_target_columns]

# One row per (snoRNA, lncRNA target) pair; the raw target string is dropped afterwards
snoRNA_lncRNA = (snoRNA_lncRNA
                 .assign(processed_targets=snoRNA_lncRNA.lncrna_targets.str.split(';'))
                 .explode('processed_targets')
                 .drop('lncrna_targets', axis=1))

snoRNA_lncRNA.to_csv(edge_data_location + 'snoRNA-lncRNA.txt', header=None, sep='\t', index=None)
snoRNA_lncRNA.head()

***
### snoRNA-snRNA

* [snoDB](https://bioinfo-scottgroup.med.usherbrooke.ca/snoDB/)

In [None]:
# snoRNA descriptors plus the semicolon-separated snRNA targets; rows without targets are skipped
snrna_target_columns = ['snodb_id', 'ensembl_id', 'refseq_id', 'hgnc_id', 'ncbi_id', 'host_gene_id',
                        'gene_name', 'snrna_targets']
has_snrna_targets = snoDB['snrna_targets'].notna()
snoRNA_snRNA = snoDB.loc[has_snrna_targets, snrna_target_columns]

# One row per (snoRNA, snRNA target) pair; the raw target string is dropped afterwards
snoRNA_snRNA = (snoRNA_snRNA
                .assign(processed_targets=snoRNA_snRNA.snrna_targets.str.split(';'))
                .explode('processed_targets')
                .drop('snrna_targets', axis=1))

snoRNA_snRNA.to_csv(edge_data_location + 'snoRNA-snRNA.txt', header=None, sep='\t', index=None)

***
### lncRNA-disease

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/) <br /> LncRNADisease integrates comprehensive experimentally supported and predicted ncRNA-disease associations curated from manual literatures and other resources.

In [None]:
data_downloader('http://www.rnanut.net/lncrnadisease/static/download/experimental%20lncRNA-disease%20information.xlsx', unprocessed_data_location)

lncRNA_disease = pd.read_excel(unprocessed_data_location + 'experimental%20lncRNA-disease%20information.xlsx')  
# We keep only rows dealing with HS
lncRNA_disease = lncRNA_disease[lncRNA_disease['Species'].str.contains("sapiens")]
lncRNA_disease

In [None]:
lncRNA_disease = lncRNA_disease.rename(columns={"Disease Name": "desc"})
lncRNA_disease['desc'] = lncRNA_disease['desc'].str.lower()
lncRNA_disease = pd.merge(desc_do_map, lncRNA_disease, on=['desc'])

lncRNA_disease.to_csv(edge_data_location + 'lncRNA-disease.txt', header=None, sep='\t', index=None)

***
### circRNA-disease

* [LncRNADisease](http://www.rnanut.net/lncrnadisease/)

In [None]:
# NOTE(review): unlike the other raw downloads, this spreadsheet is stored in the edge
# data folder rather than in the unprocessed data folder — confirm whether that is intended.
data_downloader('http://www.rnanut.net/lncrnadisease/static/download/experimental%20circRNA-disease%20information.xlsx', edge_data_location)

# Same reasoning of lncRNA-disease: keep the Homo sapiens rows and ground the
# lower-cased disease names through the description-to-DOID map
circRNA_disease = pd.read_excel(edge_data_location + 'experimental%20circRNA-disease%20information.xlsx')
circRNA_disease = circRNA_disease[circRNA_disease['Species'].str.contains("sapiens")]

circRNA_disease = circRNA_disease.rename(columns={"Disease Name": "desc"})
circRNA_disease['desc'] = circRNA_disease['desc'].str.lower()
circRNA_disease = pd.merge(desc_do_map, circRNA_disease, on=['desc'])

# Remove the "circ" markers from the symbols. The pattern uses an anchor and an
# alternation, so it must be applied as a regular expression; `regex=True` is explicit
# because the default of `regex` differs between pandas versions.
circRNA_disease['ncRNA Symbol'] = circRNA_disease['ncRNA Symbol'].str.replace(r'^.*?circ-|circ', '', regex=True)

circRNA_disease.to_csv(edge_data_location + 'circRNA-disease.txt', header=None, sep='\t', index=None)

***
### lncRNA-protein

* [LncBook](https://ngdc.cncb.ac.cn/lncbook/) <br /> LncBook accommodates a high-quality collection of human lncRNA genes and transcripts, and incorporates their abundant annotations at different omics levels, thereby enabling users to decipher functional signatures of lncRNAs in human diseases and different biological contexts. 

In [None]:
# Download the LncBook lncRNA-RBP table and load it
data_downloader('https://ngdc.cncb.ac.cn/lncbook/files/lncrna_rbp_LncBook2.0.csv.gz', unprocessed_data_location)

lncRNA_protein = pd.read_csv(unprocessed_data_location + 'lncrna_rbp_LncBook2.0.csv')  
# Preview only: rows whose Symbol is '-' are hidden in this display, but the result is
# not assigned, so lncRNA_protein itself stays unfiltered.
# NOTE(review): confirm whether this filter was meant to be applied to the data as well.
lncRNA_protein[lncRNA_protein['Symbol']!='-']

In [None]:
lncRNA_protein.to_csv(edge_data_location + 'lncRNA-protein.txt', header=None, sep='\t', index=None)

***
### lncRNA-biological role

* [dbEssLnc](https://esslnc.pufengdu.org/home) <br /> dbEssLnc contains lncRNA annotations; data are constantly added by manual screening. 

In [None]:
data_downloader('https://esslnc.pufengdu.org/data/essential%20lncRNA.json', unprocessed_data_location)
lncRNA_role = pd.read_json(unprocessed_data_location + 'essential%20lncRNA.json')
lncRNA_role

In [None]:
lncRNA_role.Role.unique()

In [None]:
# For grounding purposes
lncRNA_role.replace('Tumor suppressor gene', 'Tumor-Suppressor-Gene', inplace=True)
lncRNA_role.Name = lncRNA_role.Name.str.upper()

lncRNA_role.to_csv(edge_data_location + 'lncRNA-role.txt', header=None, sep='\t', index=None)

***
### lncRNA-cellular compartment

* [lncATLAS](https://lncatlas.crg.eu/) <br /> LncATLAS displays the subcellular localisation for GENCODE-annotated lncRNAs. This localisation is expressed in units of Relative Concentration Index (RCI) - a comparison of the concentration of a gene, per unit mass of RNA, between two cellular compartments.

In [None]:
# Download the complete lncATLAS table and load it.
# NOTE(review): the URL contains what looks like a session identifier and the file name
# read below contains a date; both presumably change between downloads — confirm and
# update them when re-running.
data_downloader('https://lncatlas.crg.eu/session/014e12df4b0975891edb6d8ba3a33b0e/download/retrieveall?w=', unprocessed_data_location)
lncRNA_comp = pd.read_csv(unprocessed_data_location + '2023-05-09_lncATLAS_all_data.csv')
lncRNA_comp

In [None]:
# Data cleaning rule to establish relations: discard RCI below the mean
lncRNA_comp.Value.mean()

In [None]:
lncRNA_comp = lncRNA_comp[lncRNA_comp.Value >= lncRNA_comp.Value.mean()]

In [None]:
# Mapping to GO CC
lncRNA_comp['Data Type'].unique()

In [None]:
# Cellular compartment name -> GO cellular component identifier
compartment_to_gocc = {'nucleus': 'GO_0005634', 'cytosol': 'GO_0005829',
                       'chromatin': 'GO_0000785', 'membrane': 'GO_0016020',
                       'nucleolus': 'GO_0005730', 'nucleoplasm': 'GO_0005654'}
lncRNA_comp = lncRNA_comp.assign(gocc=lncRNA_comp['Data Type'].replace(compartment_to_gocc))

# Keep only the rows that were mapped to a GO identifier
is_grounded = lncRNA_comp['gocc'].astype(str).str.startswith('GO_')
lncRNA_comp = lncRNA_comp[is_grounded]

In [None]:
lncRNA_comp.to_csv(edge_data_location + 'lncRNA-gocc.txt', header=None, sep='\t', index=None)

***
### lncRNA-biological process

* [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/) <br /> ncRDeathDB includes ncRNA types associated with apoptosis, autophagy, and necrosis.

In [None]:
data_downloader('https://www.rna-society.org/ncrdeathdb/data/allNcRNACelldeathData.xlsx', unprocessed_data_location)
lncRNA_gobp = pd.read_excel(unprocessed_data_location + 'allNcRNACelldeathData.xlsx', dtype={"geneid": "string"})
lncRNA_gobp = lncRNA_gobp[(lncRNA_gobp['RNA Category'] == 'lncRNA') & (lncRNA_gobp.Organism.str.contains('apiens'))]
lncRNA_gobp

In [None]:
lncRNA_gobp['gobp'] = lncRNA_gobp['Pathway'].replace({'necrosis': 'GO_0097300', 'autophagy': 'GO_0006914',
                                                      'apoptosis': 'GO_0006915'})

In [None]:
lncRNA_gobp[['geneid', 'gobp']].dropna().to_csv(edge_data_location + 'lncRNA-gobp.txt', header=None, sep='\t', index=None)

***
### miRNA-biological process

* [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/)

In [None]:
# Load the ncRDeathDB spreadsheet again and keep only miRNA rows whose
# organism string contains 'apiens'. The copy makes the filtered frame
# independent so the column assignments below do not raise SettingWithCopyWarning.
miRNA_gobp = pd.read_excel(unprocessed_data_location + 'allNcRNACelldeathData.xlsx', dtype={"geneid": "string"})
miRNA_gobp = miRNA_gobp[(miRNA_gobp['RNA Category'] == 'miRNA') & (miRNA_gobp.Organism.str.contains('apiens'))].copy()

# A cell may list several comma-separated miRBase IDs: produce one row per ID.
miRNA_gobp['miRBase_ID'] = miRNA_gobp.miRBase_ID.str.split(',')
miRNA_gobp = miRNA_gobp.explode('miRBase_ID')
# Strip whitespace left around each ID by the split (e.g. "A, B"), so that the
# identifiers can be matched exactly later on.
miRNA_gobp['miRBase_ID'] = miRNA_gobp['miRBase_ID'].str.strip()
miRNA_gobp

In [None]:
# Map the cell-death pathway label to a GO biological-process ID.
# `replace` leaves any other Pathway value unchanged.
mir_pathway_to_gobp = {
    'necrosis': 'GO_0097300',
    'autophagy': 'GO_0006914',
    'apoptosis': 'GO_0006915',
}
miRNA_gobp['gobp'] = miRNA_gobp['Pathway'].replace(mir_pathway_to_gobp)

In [None]:
# Write (miRBase_ID, gobp) pairs, skipping rows where either value is missing,
# as a tab-separated file without header or index.
miRNA_gobp[['miRBase_ID', 'gobp']].dropna().to_csv(edge_data_location + 'miRNA-gobp.txt', header=None, sep='\t', index=None)

***
### snoRNA-biological process

* [ncRDeathDB](https://www.rna-society.org/ncrdeathdb/)

In [None]:
# Load the ncRDeathDB spreadsheet again and keep only snoRNA rows whose
# organism string contains 'apiens'. `geneid` is read as a string column.
ncrdeath_sno_path = unprocessed_data_location + 'allNcRNACelldeathData.xlsx'
snoRNA_gobp = pd.read_excel(ncrdeath_sno_path, dtype={"geneid": "string"})
sno_category_mask = snoRNA_gobp['RNA Category'] == 'snoRNA'
sno_human_mask = snoRNA_gobp.Organism.str.contains('apiens')
snoRNA_gobp = snoRNA_gobp[sno_category_mask & sno_human_mask]
snoRNA_gobp

In [None]:
# CEBPA is gene ID 1050
# NOTE(review): this overwrites `Gene_Symbol` with '1050' for EVERY row of
# snoRNA_gobp, which is only correct if all remaining rows refer to CEBPA —
# confirm against the filtered table.
snoRNA_gobp[['Gene_Symbol']] = '1050'

In [None]:
# Display the first row (as a one-row DataFrame) to check the override above.
snoRNA_gobp.iloc[[0]]

In [None]:
# Map the cell-death pathway label to a GO biological-process ID.
# `replace` leaves any other Pathway value unchanged.
sno_pathway_to_gobp = {
    'necrosis': 'GO_0097300',
    'autophagy': 'GO_0006914',
    'apoptosis': 'GO_0006915',
}
snoRNA_gobp['gobp'] = snoRNA_gobp['Pathway'].replace(sno_pathway_to_gobp)

In [None]:
# Write (Gene_Symbol, gobp) pairs, skipping rows where either value is missing,
# as a tab-separated file without header or index.
snoRNA_gobp[['Gene_Symbol', 'gobp']].dropna().to_csv(edge_data_location + 'snoRNA-gobp.txt', header=None, sep='\t', index=None)

***
### Stem-loop miRNA-TF(protein)

* [miRNet](https://www.mirnet.ca/)
miRNet is an easy-to-use web-based tool that offers statistical, visual and network-based approaches to help researchers understand miRNA functions and regulatory mechanisms. miRNet offers a comprehensive tool suite to enable statistical analysis and functional interpretation of various data generated from current miRNA studies.

In [None]:
# Download the miRNet miRNA–TF table (human, judging by the 'hsa' file name)
# into the raw-data folder and load it.
data_downloader('https://www.dropbox.com/s/78r0tazedtkhi5g/miRNet-mir-tf-hsa.csv', unprocessed_data_location)

miRNA_TF = pd.read_csv(unprocessed_data_location + 'miRNet-mir-tf-hsa.csv')  
miRNA_TF

In [None]:
# Save the whole table as a tab-separated file without header or index.
miRNA_TF.to_csv(edge_data_location + 'miRNA-TF.txt', header=None, sep='\t', index=None)

***
### miRNA-chemical

* [SM2miR](http://www.jianglab.cn/SM2miR/) <br /> SM2miR is a manually curated database that collects and incorporates the experimentally validated effects of small molecules on miRNA expression from published papers. Each entry contains detailed information about small molecules, miRNAs and their relationships.

In [None]:
# Download the SM2miR spreadsheet into the raw-data folder and load it.
data_downloader('http://www.jianglab.cn/SM2miR/files/SM2miR3.xls', unprocessed_data_location)

miRNA_chemical = pd.read_excel(unprocessed_data_location + 'SM2miR3.xls')  
# Keep rows whose Species contains 'sapiens'. `na=False` treats a missing
# Species as "no match"; without it the mask would contain NaN and boolean
# indexing would raise. The copy makes the result an independent frame.
miRNA_chemical = miRNA_chemical[miRNA_chemical['Species'].str.contains('sapiens', na=False)].copy()
miRNA_chemical

In [None]:
# Prefix miRNA names with the 'hsa-' species code.
miRNA_chemical['miRNA'] = 'hsa-' + miRNA_chemical['miRNA']

# Normalise the small-molecule name: lower-case it and remove any parenthesised
# part (with an optional leading space). The pattern is a raw string and is
# passed with regex=True explicitly: since pandas 2.0 `str.replace` treats the
# pattern as a literal by default, in which case nothing would be removed.
# ('small melocule' is the column name as spelled in the source file.)
miRNA_chemical['small melocule'] = (
    miRNA_chemical['small melocule']
    .str.lower()
    .str.replace(r"\(.*?\)| \(.*?\)", '', regex=True)
    .str.rstrip()
)

# An entry may combine several molecules joined by '+': one row per molecule,
# with surrounding whitespace removed.
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.split('+')
miRNA_chemical = miRNA_chemical.explode('small melocule')
miRNA_chemical['small melocule'] = miRNA_chemical['small melocule'].str.strip()

# Fix join columns
# Inner-join on the molecule name against column 0 of desc_chebi_map
# (defined outside this section); names without a match are dropped.
miRNA_chemical = pd.merge(miRNA_chemical, desc_chebi_map, left_on=['small melocule'], right_on=[0])

miRNA_chemical

In [None]:
# Save the merged table as a tab-separated file without header or index.
miRNA_chemical.to_csv(edge_data_location + 'miRNA-chemical.txt', header=None, sep='\t', index=None)

***
### gRNA-gene

* [Addgene](https://www.addgene.org/)

In [None]:
# copy-paste from https://www.addgene.org/crispr/reference/grna-sequence/#datatable
gRNA_gene = pd.read_csv(unprocessed_data_location + 'grna_addgene.txt', sep='\t', dtype = {"Plasmid ID":str})  
gRNA_gene.columns=gRNA_gene.columns.str.rstrip()
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].notna()]
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].str.contains('sapiens')]
gRNA_gene['Plasmid ID'] = 'www.addgene.org/'+gRNA_gene['Plasmid ID'].str.rstrip()
gRNA_gene['Target Gene'] = gRNA_gene['Target Gene'].str.upper().str.rstrip()
gRNA_gene

In [None]:
# Save the whole table as a tab-separated file without header or index.
gRNA_gene.to_csv(edge_data_location + 'gRNA-gene.txt', header=None, sep='\t', index=None)

In [None]:
# TODO: manually add the Entrez IDs that are not present in PKL's mapping.
# Lists the target-gene symbols that do not occur in column 0 of
# symbol_ensembl_map (defined outside this section).
list(set(gRNA_gene['Target Gene']) - set(symbol_ensembl_map[0].unique()))

***
### ASO-gene

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709) <br /> DrugBank is a comprehensive, free-to-access, online database containing information on drugs and drug targets. As both a bioinformatics and a cheminformatics resource, it combines detailed drug (i.e. chemical, pharmacological and pharmaceutical) data with comprehensive drug target (i.e. sequence, structure, and pathway) information.

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_gene = pd.read_csv(unprocessed_data_location + 'ASO-gene_DrugBank.txt', sep='\t') 
ASO_gene

In [None]:
# Hand-curated annotations, aligned by row position with the table loaded above:
# both lists hold 19 values, so the file must contain exactly 19 rows or the
# assignment raises. An empty string means no ID was assigned for that row.
# NOTE(review): the second NCBI entry is 'a', which is not a gene ID — it looks
# like a placeholder; confirm what it is meant to be.
ASO_gene['NCBI']=['338', 'a', '', '', '', '', '1756', '', '338', '211', '', '1756', '1756', '', '', '', '', '7276', '']
ASO_gene['DB ID']=['DB05528',
                   'DB05487', 'DB05487', 'DB05487',
                   'DB06759', 'DB06759',
                   'DB06014',
                   'DB13161',
                   'DB14713',
                   'DB15066',
                   'DB15593',
                   'DB15005',
                   'DB14984', 'DB14984', 'DB14984', 'DB14984', 'DB14984',
                   'DB16699', 'DB16699']
ASO_gene

In [None]:
# Save the annotated table as a tab-separated file without header or index.
ASO_gene.to_csv(edge_data_location + 'ASO-gene.txt', header=None, sep='\t', index=None)

***
### ASO-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001709
ASO_disease = pd.read_csv(unprocessed_data_location + 'ASO-disease_DrugBank.txt', sep='\t') 
pd.set_option('display.max_colwidth', None)
ASO_disease

In [None]:
# Hand-curated DrugBank ID -> MONDO disease pairs. Each pair is written on one
# line so the alignment between the two columns is explicit. An empty string
# means no MONDO term was assigned.
aso_disease_pairs = [
    ('DB05528', 'MONDO_0018328'),
    ('DB05487', 'MONDO_0001657'),
    ('DB05487', 'MONDO_0007254'),
    ('DB06759', 'MONDO_0000878'),
    ('DB13811', ''),
    ('DB06014', 'MONDO_0010679'),
    ('DB13161', 'MONDO_0001516'),
    ('DB14713', 'MONDO_0017132'),
    ('DB14713', 'MONDO_0001824'),
    ('DB15066', 'MONDO_0002520'),
    ('DB15593', 'MONDO_0010679'),
    ('DB15005', 'MONDO_0010679'),
    ('DB14984', 'MONDO_0010679'),
    ('DB16699', 'MONDO_0017132'),
]
ASO_disease = pd.DataFrame(aso_disease_pairs, columns=['DB ID', 'MONDO'])
ASO_disease

In [None]:
# Save the pairs as a tab-separated file without header or index.
# NOTE(review): the DB13811 row has an empty MONDO value and is written as-is —
# confirm the downstream edge builder tolerates an empty target.
ASO_disease.to_csv(edge_data_location + 'ASO-disease.txt', header=None, sep='\t', index=None)

***
### ASO-protein

* [DrugBank](https://go.drugbank.com/categories/DBCAT001709)

In [None]:
# Hand-curated protein identifiers, aligned by row position with ASO_gene
# (19 values; an empty string means no identifier was assigned for that row).
ASO_gene['PRO']=['', 'PR_000007204', 'PR_000011178', 'PR_000001754', 'PR_Q9BTL4', 'PR_Q16621',
                 '', 'PR_Q16637', '', '', 'PR_P11532', '', '', 'PR_P08684', 'PR_P20815', 'PR_P11712', 'PR_P33261', '', 'PR_P02768']
ASO_gene

In [None]:
# Select the columns for the ASO–protein table (the NCBI column is left out).
ASO_protein = ASO_gene[['Drug', 'Target', 'Type', 'PRO', 'DB ID']]
ASO_protein

In [None]:
# Save the selection as a tab-separated file without header or index.
ASO_protein.to_csv(edge_data_location + 'ASO-protein.txt', header=None, sep='\t', index=None)

***
### siRNA-gene

* [DrugBank](https://go.drugbank.com/categories/DBCAT005484) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_gene = pd.read_csv(unprocessed_data_location + 'siRNA-gene_DrugBank.txt', sep='\t') 
siRNA_gene

In [None]:
# Hand-curated annotations, aligned by row position with the table loaded above:
# both lists hold 9 values, so the file must contain exactly 9 rows.
# An empty string means no ID was assigned for that row.
# NOTE(review): the second NCBI entry is 'a', which is not a gene ID — it looks
# like a placeholder; confirm what it is meant to be.
siRNA_gene['NCBI']=['7276', 'a', '', '338', '54363', '', '', '7276', '']
siRNA_gene['DB ID']=['DB14582', 'DB14582', 'DB14582',
                     'DB15066',
                     'DB15935', 'DB15935', 'DB15935',
                     'DB16699', 'DB16699']
siRNA_gene

In [None]:
# Save the annotated table as a tab-separated file without header or index.
siRNA_gene.to_csv(edge_data_location + 'siRNA-gene.txt', header=None, sep='\t', index=None)

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html) <br /> The MIT/ICBP siRNA Database has validated siRNA and shRNA sequences against over 100 genes.

In [None]:
# Parse every HTML table on the MIT/ICBP page and take the second one.
icbp_tables = pd.read_html('http://web.mit.edu/sirna/sirnas-gene.html')
ICBP = icbp_tables[1]

# The first data row holds the real column names: promote it to the header,
# then remove it from the data.
icbp_header_row = ICBP.iloc[0]
ICBP.columns = icbp_header_row
ICBP = ICBP.drop(index=0)
ICBP

In [None]:
# For post-processing purposes: append '.html' to every ID.
# NOTE: re-running this cell appends the suffix again.
ICBP['ID#'] = ICBP['ID#'] + '.html'

# Entries flagged 'x' in both the siRNA and the Human columns. The copy makes
# this an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning or depend on view/copy semantics.
ICBPsiRNA = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
ICBPsiRNA

In [None]:
# Save the selected rows as a tab-separated file without header or index.
ICBPsiRNA.to_csv(edge_data_location + 'siRNA-geneICBP.txt', header=None, sep='\t', index=None)

***
### shRNA-gene

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html)

In [None]:
# Entries flagged 'x' in both the shRNA and the Human columns. The copy makes
# this an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning or depend on view/copy semantics.
ICBPshRNA = ICBP.loc[(ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')].copy()
ICBPshRNA

In [None]:
# Save the selected rows as a tab-separated file without header or index.
ICBPshRNA.to_csv(edge_data_location + 'shRNA-gene.txt', header=None, sep='\t', index=None)

***
### siRNA-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT005484) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005484
siRNA_disease = pd.read_csv(unprocessed_data_location + 'siRNA-disease_DrugBank.txt', sep='\t') 
siRNA_disease

In [None]:
# Hand-curated DrugBank ID -> MONDO disease pairs, one pair per line so the
# alignment between the two columns is explicit.
sirna_disease_pairs = [
    ('DB14582', 'MONDO_0017132'),
    ('DB14582', 'MONDO_0001824'),
    ('DB15066', 'MONDO_0002520'),
    ('DB15935', 'MONDO_0009823'),
    ('DB16699', 'MONDO_0017132'),
    ('DB16699', 'MONDO_0001824'),
]
siRNA_disease = pd.DataFrame(sirna_disease_pairs, columns=['DB ID', 'MONDO'])
siRNA_disease

In [None]:
# Save the pairs as a tab-separated file without header or index.
siRNA_disease.to_csv(edge_data_location + 'siRNA-disease.txt', header=None, sep='\t', index=None)

***
### aptamer-protein

* [DrugBank](https://go.drugbank.com/categories/DBCAT001641) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001641
aptamer_protein = pd.read_csv(unprocessed_data_location + 'aptamer-protein_DrugBank.txt', sep='\t') 
aptamer_protein

In [None]:
# Hand-curated annotations, aligned by row position with the table loaded above
# (4 values each, so the file must contain exactly 4 rows).
aptamer_protein['PRO']=['PR_000001575', 'PR_000001576', 'PR_000001577', 'PR_000001752']
aptamer_protein['DB ID']=['DB04932', 'DB04932', 'DB04932', 'DB04998']
aptamer_protein

In [None]:
# Save the annotated table as a tab-separated file without header or index.
aptamer_protein.to_csv(edge_data_location + 'aptamer-protein.txt', header=None, sep='\t', index=None)

***
### aptamer-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT001641) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT001641
aptamer_disease = pd.read_csv(unprocessed_data_location + 'aptamer-disease_DrugBank.txt', sep='\t') 
aptamer_disease

In [None]:
# Hand-curated DrugBank ID -> MONDO disease pairs, one pair per line so the
# alignment between the two columns is explicit.
aptamer_disease_pairs = [
    ('DB04932', 'MONDO_0019514'),
    ('DB04998', 'MONDO_0004992'),
    ('DB04998', 'MONDO_0002367'),
    ('DB04998', 'MONDO_0004643'),
    ('DB04998', 'MONDO_0009831'),
]
aptamer_disease = pd.DataFrame(aptamer_disease_pairs, columns=['DB ID', 'MONDO'])
aptamer_disease

In [None]:
# Save the pairs as a tab-separated file without header or index.
aptamer_disease.to_csv(edge_data_location + 'aptamer-disease.txt', header=None, sep='\t', index=None)

***
### mRNA (vaccines)-disease

* [DrugBank](https://go.drugbank.com/categories/DBCAT005631) 

In [None]:
# copy-paste from https://go.drugbank.com/categories/DBCAT005631
mRNAv_disease = pd.read_csv(unprocessed_data_location + 'mRNAv-disease_DrugBank.txt', sep='\t') 
mRNAv_disease

In [None]:
# Hand-curated DrugBank IDs, aligned by row position with the table loaded above
# (8 values, so the file must contain exactly 8 rows). Every row is assigned
# the same MONDO term.
mRNAv_disease['DB ID']=['DB15654', 'DB15695', 'DB15696' , 'DB16401', 'DB16402', 'DB17088', 'DB17090', 'DB17095']
mRNAv_disease['MONDO']='MONDO_0100096'
mRNAv_disease

In [None]:
# Save the annotated table as a tab-separated file without header or index.
mRNAv_disease.to_csv(edge_data_location + 'mRNAv-disease.txt', header=None, sep='\t', index=None)

***
### lncRNA-gene

* [LncExpDB](https://ngdc.cncb.ac.cn/lncexpdb/) <br /> LncExpDB is a comprehensive database for lncRNA expression. It covers expression profiles of lncRNA genes across various biological contexts, predicts potential functional lncRNAs and their interacting partners, and thus provides essential guidance on experimental design.

In [None]:
# Download from https://ngdc.cncb.ac.cn/lncexpdb/interactions --> Download button
lncRNA_gene = pd.read_csv(unprocessed_data_location + 'interaction.txt', sep='\t') 
lncRNA_gene

In [None]:
# Inspect the distinct values of the `distance` column.
lncRNA_gene.distance.unique()

In [None]:
# Drop rows without an lncRNA name, expand comma-separated names into one row
# per name, and remove the exact duplicate rows that this may produce.
lncRNA_gene = (
    lncRNA_gene[lncRNA_gene['lncname'].notna()]
    .assign(lncname=lambda named: named.lncname.str.split(','))
    .explode('lncname')
    .drop_duplicates()
)

In [None]:
# Keep interactions whose `distance` is not -1 and whose `breadth` equals 1,
# then write the (lncname, pcgname) pairs as a headerless, index-free TSV.
distance_is_set = lncRNA_gene['distance'] != -1
breadth_is_one = lncRNA_gene['breadth'] == 1
lncRNA_gene_pairs = lncRNA_gene[distance_is_set & breadth_is_one][['lncname', 'pcgname']]
lncRNA_gene_pairs.to_csv(edge_data_location + 'lncRNA-gene.txt', header=None, sep='\t', index=None)

***
### riboswitch-protein

* [TBDB](https://tbdb.io/) <br /> 

TBDB contains T-box riboswitch fold prediction, tRNA pairs from host organisms, information regarding T-box riboswitch genetic context, and thermodynamic calculations of putative T-box riboswitch sequences found in nature.

In [None]:
# Read the TBDB table straight from its URL (nothing is saved to the raw-data folder).
riboswitch_protein = pd.read_csv('https://tbdb.io/database/tbdb.csv', sep=',') 
riboswitch_protein

In [None]:
# For post-processing purposes: append '.html' to every riboswitch name.
riboswitch_protein.unique_name = riboswitch_protein.unique_name+'.html'

# Lower-case the protein description; it is used as a join key further down.
riboswitch_protein.downstream_protein = riboswitch_protein.downstream_protein.str.lower()
# NOTE(review): this writes the un-mapped (name, description) pairs to
# 'riboswitch-protein.txt', the same path that is written again after the
# merge below — this first write appears to be redundant; confirm.
riboswitch_protein[['unique_name','downstream_protein']].to_csv(edge_data_location +
                                                                'riboswitch-protein.txt', header=None, sep='\t',
                                                                index=None)

In [None]:
# Fix join columns
# Inner-join the lower-cased protein description against column 0 of
# desc_pro_map (defined outside this section) and keep the riboswitch name
# together with column 1 of the map.
riboswitch_name_desc = riboswitch_protein[['unique_name', 'downstream_protein']]
riboswitch_joined = pd.merge(riboswitch_name_desc, desc_pro_map,
                             left_on=['downstream_protein'], right_on=[0])
riboswitch_protein = riboswitch_joined[['unique_name', 1]]
riboswitch_protein

In [None]:
# Save the mapped pairs as a tab-separated file without header or index.
riboswitch_protein.to_csv(edge_data_location + 'riboswitch-protein.txt', header=None, sep='\t', index=None)

***
### riboswitch-gobp

* [TBDB](https://tbdb.io/) <br /> 

In [None]:
# Read the TBDB table from its URL again, append '.html' to every riboswitch
# name, and display the protein descriptions.
riboswitch_gobp = pd.read_csv('https://tbdb.io/database/tbdb.csv', sep=',') 
riboswitch_gobp.unique_name = riboswitch_gobp.unique_name+'.html'
riboswitch_gobp.protein_desc

In [None]:
# Extract only GO terms
# Take the text between the last '[' and the last ']' that follows it in each
# protein description, and turn ':' into '_'.
text_after_bracket = riboswitch_gobp.protein_desc.str.rpartition('[')[2]
text_in_brackets = text_after_bracket.str.rpartition(']')[0]
gobp = text_in_brackets.str.replace(":", "_")

# Attach the result as a `gobp` column and keep the rows that contain "GO".
gobp_column = gobp.rename('gobp')
riboswitch_gobp = pd.concat([riboswitch_gobp, gobp_column], axis=1)
mentions_go = riboswitch_gobp.gobp.str.contains("GO", na=False)
riboswitch_gobp = riboswitch_gobp[mentions_go]
riboswitch_gobp[['unique_name', 'gobp']]

In [None]:
# Write (unique_name, gobp) pairs as a tab-separated file without header or index.
riboswitch_gobp[['unique_name', 'gobp']].to_csv(edge_data_location + 'riboswitch-gobp.txt', header=None, sep='\t', index=None)

***
### ribozyme-gobp

* [Ribocentre](https://www.ribocentre.org/) <br />
Ribocentre is designed to contain comprehensive information of all natural ribozymes.

In [None]:
# Download the Ribocentre export and load it, taking the column names from the
# second spreadsheet row (header=1).
# NOTE(review): the URL ends in an opaque identifier while the file read is
# 'Ribocentre - Application.xlsx' — presumably data_downloader saves it under
# that name; confirm.
data_downloader('https://www.ribocentre.org/38dffd70-0f9f-499b-b442-be2f6e91a156', unprocessed_data_location)
ribozyme_gobp = pd.read_excel(unprocessed_data_location + 'Ribocentre - Application.xlsx', header=1) 
ribozyme_gobp

In [None]:
# Hand-curated GO identifiers, aligned by row position with the spreadsheet
# (34 values, so the sheet must contain exactly 34 rows). An empty string means
# no GO term was assigned for that row.
ribozyme_gobp['gobp'] = ['','','GO_0015867', 'GO_0032363', 'GO_0010468', 'GO_0010468', 'GO_0010468', 'GO_2000232',
                         'GO_0010468', 'GO_0010468', 'GO_0003743', '', '', '', '', '', '', '', 'GO_0010468',
                         '', '', '', 'GO_0050790', '', '', '', '', '', 'GO_0050790', '', '', '', '', '']
ribozyme_gobp

In [None]:
# Write (ribozyme name, gobp) pairs as a tab-separated file without header or index.
# NOTE(review): rows whose gobp is an empty string are written too — confirm
# the downstream edge builder tolerates an empty target.
ribozyme_gobp[['ribozyme name', 'gobp']].to_csv(edge_data_location + 'ribozyme-gobp.txt', header=None, sep='\t', index=None)

***
### viral RNA-ribozyme

* [ViroidDB](https://viroids.org/) <br />
ViroidDB is the most comprehensive collection of viroid, satellite RNA, retrozyme, and deltavirus genome sequences available on the internet. 

In [None]:
# Download the ViroidDB JSON dump and load it; the frame is transposed after
# loading (presumably so that each record becomes a row — confirm).
data_downloader('https://viroids.org/db/latest/all.json', unprocessed_data_location)
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T 
vRNA_ribozyme

In [None]:
# Extract ribozymes
# In each record's `ribozymes` text, a hit is a line that starts with ">> ".
# Collect those lines per record, without the newlines and the ">> " marker.
myre = re.compile(r"\n>> .*?\n")
ribozyme = []
for ribozyme_report in vRNA_ribozyme.ribozymes:
    hit_lines = myre.findall(ribozyme_report)
    ribozyme.append([hit.replace("\n", '').replace(">> ", '') for hit in hit_lines])

# List of all possible ribozymes (useful for mapping)
a = [hit_name for record_hits in ribozyme for hit_name in record_hits]
set(a)

In [None]:
# Renumber the records from 0 so they line up with the per-record hit lists,
# which are attached as a new column labelled 0.
vrna_records = vRNA_ribozyme.reset_index(drop=True)
ribozyme_hits = pd.Series(ribozyme)
vRNA_ribozyme = pd.concat([vrna_records, ribozyme_hits], axis=1)

# One row per hit; keep only the first whitespace-separated token of each hit line.
vRNA_ribozyme = vRNA_ribozyme.explode(0)
first_token = vRNA_ribozyme[0].str.split().str[0]
vRNA_ribozyme[0] = first_token
vRNA_ribozyme[['accession', 0]]

In [None]:
# Write (accession, ribozyme) pairs as a tab-separated file without header or index.
vRNA_ribozyme[['accession', 0]].to_csv(edge_data_location + 'viralRNA-ribozyme.txt', header=None, sep='\t', index=None)

***
### protein-extracellular form

* [Vesiclepedia](http://microvesicles.org/index.html)  <br /> Vesiclepedia is a manually curated compendium of molecular data (lipid, RNA and protein) identified in different classes of extracellular vesicles. 

In [None]:
# Download the Vesiclepedia protein/mRNA table and keep only protein rows whose
# species string contains 'apiens'.
data_downloader('http://microvesicles.org/Archive/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_4.1.txt', unprocessed_data_location)
protein_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_PROTEIN_MRNA_DETAILS_4.1.txt', sep='\t')
protein_content_mask = protein_ev['CONTENT TYPE'] == 'protein'
protein_human_mask = protein_ev['SPECIES'].str.contains('apiens')
protein_ev = protein_ev[protein_content_mask & protein_human_mask]
protein_ev

In [None]:
# Download and load the Vesiclepedia experiment table; it provides the vesicle
# type for each experiment ID.
data_downloader('http://microvesicles.org/Archive/VESICLEPEDIA_EXPERIMENT_DETAILS_4.1.txt', unprocessed_data_location)
protein_ev_exp = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_EXPERIMENT_DETAILS_4.1.txt', sep='\t')
protein_ev_exp

In [None]:
# Attach the vesicle type to each protein record through the shared experiment
# ID (inner join), keep the (gene symbol, vesicle type) pairs, and list the
# distinct vesicle types.
protein_by_experiment = protein_ev[['GENE SYMBOL', 'EXPERIMENT ID']]
protein_vesicle_types = protein_ev_exp[['EXPERIMENT ID', 'VESICLE TYPE']]
protein_ev = pd.merge(protein_by_experiment, protein_vesicle_types, on='EXPERIMENT ID')
protein_ev = protein_ev[['GENE SYMBOL', 'VESICLE TYPE']]
protein_ev['VESICLE TYPE'].unique()

In [None]:
# Map the vesicle type to a GO cellular-component ID. The replacements are
# applied in order, so the specific labels are handled before the catch-all.
# Literal replacements pass regex=False explicitly so their meaning does not
# depend on the pandas version's default.
protein_ev['gocc'] = protein_ev['VESICLE TYPE'].str.replace(r".*xosomes.*", "GO_0070062", regex=True)
protein_ev['gocc'] = protein_ev['gocc'].str.replace("Membrane blebs", "GO_0032059", regex=False)
protein_ev['gocc'] = protein_ev['gocc'].str.replace("Apoptotic bodies", "GO_0097189", regex=False)
protein_ev['gocc'] = protein_ev['gocc'].str.replace(r".*embrane", "GO_0016020", regex=True)
protein_ev['gocc'] = protein_ev['gocc'].str.replace(r".*icrovesicles", "GO_1990742", regex=True)
# Catch-all: prefix any value that still does not start with "GO_" with GO_1990742.
protein_ev['gocc'] = protein_ev['gocc'].str.replace(r"^(?!GO_.*)", "GO_1990742", regex=True)
# Keep only the first run of digits and rebuild a clean "GO_<digits>" ID; this
# drops any leftover text around the identifier. The pattern is a raw string:
# "\d" in a normal string literal is an invalid escape sequence.
protein_ev['gocc'] = 'GO_' + protein_ev['gocc'].str.extract(r'(\d+)', expand=False)
protein_ev

In [None]:
# Save the table as a tab-separated file without header or index.
protein_ev.to_csv(edge_data_location + 'protein-gocc.txt', header=None, sep='\t', index=None)

***
### miRNA-extracellular form

* [Vesiclepedia](http://microvesicles.org/index.html)

In [None]:
# Download and load the Vesiclepedia miRNA table.
data_downloader('http://microvesicles.org/Archive/VESICLEPEDIA_MIRNA_DETAILS_4.1.txt', unprocessed_data_location)
miRNA_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_MIRNA_DETAILS_4.1.txt', sep='\t')
# Keep rows whose species string contains 'apiens'. `na=False` treats a missing
# SPECIES as "no match"; without it the mask would contain NaN and boolean
# indexing would raise.
miRNA_ev = miRNA_ev[miRNA_ev['SPECIES'].str.contains('apiens', na=False)]
miRNA_ev

In [None]:
# Attach the vesicle type to each miRNA record through the shared experiment ID
# (inner join), keep the (miRNA ID, vesicle type) pairs, and list the distinct
# vesicle types.
mirna_by_experiment = miRNA_ev[['MIRNA ID', 'EXPERIMENT ID']]
mirna_vesicle_types = protein_ev_exp[['EXPERIMENT ID', 'VESICLE TYPE']]
miRNA_ev = pd.merge(mirna_by_experiment, mirna_vesicle_types, on='EXPERIMENT ID')
miRNA_ev = miRNA_ev[['MIRNA ID', 'VESICLE TYPE']]
miRNA_ev['VESICLE TYPE'].unique()

In [None]:
# Map the vesicle type to a GO cellular-component ID. These are literal
# substring replacements, so regex=False is passed explicitly.
miRNA_ev['gocc'] = miRNA_ev['VESICLE TYPE'].str.replace("Exosomes", "GO_0070062", regex=False)
miRNA_ev['gocc'] = miRNA_ev['gocc'].str.replace("Microvesicles", "GO_1990742", regex=False)
miRNA_ev['gocc'] = miRNA_ev['gocc'].str.replace("Extracellular vesicles", "GO_1990742", regex=False)
# Keep only the first run of digits and rebuild a clean "GO_<digits>" ID.
# A vesicle type that matched none of the labels above has no digits and ends
# up as NaN. The pattern is a raw string: "\d" in a normal string literal is an
# invalid escape sequence.
miRNA_ev['gocc'] = 'GO_' + miRNA_ev['gocc'].str.extract(r'(\d+)', expand=False)
miRNA_ev

In [None]:
# Save the table as a tab-separated file without header or index.
miRNA_ev.to_csv(edge_data_location + 'miRNA-gocc.txt', header=None, sep='\t', index=None)

***
### mRNA-extracellular form

* [Vesiclepedia](http://microvesicles.org/index.html)

In [None]:
# Load the Vesiclepedia protein/mRNA table again and keep only mRNA rows whose
# species string contains 'apiens'.
mRNA_ev = pd.read_csv(unprocessed_data_location+'VESICLEPEDIA_PROTEIN_MRNA_DETAILS_4.1.txt', sep='\t')
mrna_content_mask = mRNA_ev['CONTENT TYPE'] == 'mRNA'
mrna_human_mask = mRNA_ev['SPECIES'].str.contains('apiens')
mRNA_ev = mRNA_ev[mrna_content_mask & mrna_human_mask]
mRNA_ev

In [None]:
# Attach the vesicle type to each mRNA record through the shared experiment ID
# (inner join), keep the (Entrez gene ID, vesicle type) pairs, and list the
# distinct vesicle types.
mrna_by_experiment = mRNA_ev[['ENTREZ GENE ID', 'EXPERIMENT ID']]
mrna_vesicle_types = protein_ev_exp[['EXPERIMENT ID', 'VESICLE TYPE']]
mRNA_ev = pd.merge(mrna_by_experiment, mrna_vesicle_types, on='EXPERIMENT ID')
mRNA_ev = mRNA_ev[['ENTREZ GENE ID', 'VESICLE TYPE']]
mRNA_ev['VESICLE TYPE'].unique()

In [None]:
# Display the merged table.
mRNA_ev

In [None]:
# Map the vesicle type to a GO cellular-component ID: any value containing
# "xosomes" becomes GO_0070062, and "Microvesicles" becomes GO_1990742.
# NOTE(review): unlike the protein and miRNA cells there is no catch-all or
# digit extraction here, so any other vesicle type stays as raw text in `gocc` —
# confirm that only these two types occur for mRNA.
mRNA_ev['gocc'] = mRNA_ev['VESICLE TYPE'].str.replace(r".*xosomes.*", "GO_0070062", regex=True)
mRNA_ev['gocc'] = mRNA_ev['gocc'].str.replace("Microvesicles", "GO_1990742")
mRNA_ev

In [None]:
# Save the table as a tab-separated file without header or index.
mRNA_ev.to_csv(edge_data_location + 'mRNA-gocc.txt', header=None, sep='\t', index=None)

***
### circRNA-extracellular form

* [miRandola](http://mirandola.iit.cnr.it/) <br /> miRandola is a comprehensive manually curated classification of different extracellular circulating non-coding RNA types.

In [None]:
# Read the miRandola table straight from its URL and keep only circRNA rows
# whose organism string contains 'apiens'.
circRNA_ev = pd.read_csv('http://mirandola.iit.cnr.it/download/miRandola_version_02_2017.txt', sep='\t')
circ_class_mask = circRNA_ev['RNA_class'] == 'circRNA'
circ_human_mask = circRNA_ev['organism'].str.contains('apiens')
circRNA_ev = circRNA_ev[circ_class_mask & circ_human_mask]
circRNA_ev

In [None]:
# circRNA in miRandola only circulates in blood, so every row gets the same
# GO cellular-component ID.
circRNA_ev['gocc'] = 'GO_0072562'

In [None]:
# Save the table as a tab-separated file without header or index.
circRNA_ev.to_csv(edge_data_location + 'circRNA-gocc.txt', header=None, sep='\t', index=None)

***
### circRNA-miRNA

* [SomamiR DB](https://compbio.uthsc.edu/SomamiR/) <br /> SomamiR is a database of cancer somatic mutations in microRNAs (miRNA) and their target sites that potentially alter the interactions between miRNAs and competing endogenous RNAs (ceRNA) including mRNAs, circular RNAs (circRNA) and long noncoding RNAs (lncRNA). It also provides an integrated platform for the functional analysis of these somatic mutations.

In [None]:
# Download the SomamiR circRNA archive, then read the table without column
# names, skipping its first line.
# NOTE(review): the download is a .tar.gz while the file read is the plain
# .txt — presumably data_downloader unpacks the archive; confirm.
data_downloader('https://compbio.uthsc.edu/SomamiR/download/circRNA_somatic_v2.0.txt.tar.gz',
                unprocessed_data_location)
 
circRNA_miRNA = pd.read_csv(unprocessed_data_location + 'circRNA_somatic_v2.0.txt', sep="\t", header=None, skiprows=[0])
circRNA_miRNA

In [None]:
# Save the table as a tab-separated file without header or index.
circRNA_miRNA.to_csv(edge_data_location + 'circRNA-miRNA.txt', header=None, sep='\t', index=None)

***
### Remove unprocessed raw data

In [None]:
# Delete the whole raw-data folder.
# NOTE(review): this also removes the files that were placed there by hand
# (the Addgene/DrugBank copy-pastes and the LncExpDB 'interaction.txt'), which
# cannot be re-downloaded by this notebook — keep a copy before running.
shutil.rmtree(unprocessed_data_location)

## Non-ontology data

In [None]:
# Provided by PKL ecosystem
# `processed_url` is defined outside this section.
data_downloader(processed_url+'subclass_construction_map.pkl', '../resources/construction_approach/')

# Load the map and preview its first five entries.
# NOTE(review): read_pickle executes pickle code, so the download source must be trusted.
nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_map.pkl')

# For instance, ncbi IDs are mapped to appropriate SO Ontology entries
list(nonO_data.items())[:5]

***
### miRNA sequences

In [None]:
# Split the miRBase map by accession prefix: rows whose column 0 starts with
# 'MIMAT' get SO_0000276, all other rows get SO_0000647. Each identifier is
# given a one-element list of ontology classes.
# The explicit copies make both halves independent frames, so adding the 'SO'
# column does not trigger pandas' SettingWithCopyWarning.
is_mimat = mirna_mirbase_map[0].str.startswith('MIMAT')

mature_mirna = mirna_mirbase_map[is_mimat].copy()
mature_mirna['SO'] = [['SO_0000276']] * len(mature_mirna)

pre_mirna = mirna_mirbase_map[~is_mimat].copy()
pre_mirna['SO'] = [['SO_0000647']] * len(pre_mirna)

# The map is rebuilt with the 'MIMAT' rows first and now carries the 'SO' column.
mirna_mirbase_map = pd.concat([mature_mirna, pre_mirna])

# Drop column 1, key by column 0, and merge the {identifier: [SO class]}
# entries into nonO_data (entries for existing keys are overwritten).
mirna_nonO = mirna_mirbase_map.drop(1, axis=1).set_index(0).to_dict()
nonO_data = {**nonO_data, **mirna_nonO['SO']}

***
### tsRNA sequences

In [None]:
# Give every identifier in column 1 of tsRNA_tRF_map (defined outside this
# section) a one-element class list, then merge the {identifier: [SO class]}
# entries into nonO_data (entries for existing keys are overwritten).
tsrna_ids = tsRNA_tRF_map[[1]]
tsRNAnonO_data = tsrna_ids.assign(SO=[['SO_0000253']] * len(tsRNA_tRF_map))

tsRNAnonO_data = tsRNAnonO_data.set_index(1).to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(tsRNAnonO_data['SO'])

***
### ASO sequences

In [None]:
# Reduce ASO_gene to its DrugBank IDs and give every row a one-element class list.
aso_db_ids = ASO_gene[['DB ID']]
ASO_gene = aso_db_ids.assign(SO=[['SO_0000644']] * len(aso_db_ids))
ASO_gene

In [None]:
# Key the class lists by DrugBank ID and merge them into nonO_data
# (entries for existing keys are overwritten).
ASOnonO_data = ASO_gene.set_index('DB ID').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(ASOnonO_data['SO'])

***
### gRNA sequences

In [None]:
# gRNA_gene was loaded with named columns, so it has no column labelled 3 and
# `gRNA_gene[[3]]` raises KeyError. The gRNA identifier built earlier in this
# notebook is the 'Plasmid ID' column (the addgene.org path), so it is used as
# the key here.
# NOTE(review): confirm that 'Plasmid ID' is the column originally meant by index 3.
gRNA_gene = gRNA_gene[['Plasmid ID']].copy()
gRNA_gene['SO'] = [['SO_0000602']] * len(gRNA_gene)

# Key the class lists by plasmid ID and merge them into nonO_data
# (entries for existing keys are overwritten).
gRNA_nonO = gRNA_gene.set_index('Plasmid ID').to_dict()
nonO_data = {**nonO_data, **gRNA_nonO['SO']}

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### siRNA sequences

In [None]:
# Reduce siRNA_gene to its DrugBank IDs and give every row a one-element class list.
sirna_db_ids = siRNA_gene[['DB ID']]
siRNA_gene = sirna_db_ids.assign(SO=[['SO_0000646']] * len(sirna_db_ids))
siRNA_gene

# Key the class lists by DrugBank ID and merge them into nonO_data
# (entries for existing keys are overwritten).
siRNAnonO_data = siRNA_gene.set_index('DB ID').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(siRNAnonO_data['SO'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### Aptamer sequences

In [None]:
# Reduce aptamer_protein to its DrugBank IDs and give every row a one-element class list.
aptamer_db_ids = aptamer_protein[['DB ID']]
aptamer_protein = aptamer_db_ids.assign(SO=[['SO_0000033']] * len(aptamer_db_ids))
aptamer_protein

# Key the class lists by DrugBank ID and merge them into nonO_data
# (entries for existing keys are overwritten).
aptamernonO_data = aptamer_protein.set_index('DB ID').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(aptamernonO_data['SO'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### mRNA vaccines sequences

In [None]:
# Reduce mRNAv_disease to its DrugBank IDs and give every row a one-element
# class list (a VO term here, stored in a 'VO' column).
mrnav_db_ids = mRNAv_disease[['DB ID']]
mRNAv_disease = mrnav_db_ids.assign(VO=[['VO_0000186']] * len(mrnav_db_ids))
mRNAv_disease

# Key the class lists by DrugBank ID and merge them into nonO_data
# (entries for existing keys are overwritten).
mRNAvnonO_data = mRNAv_disease.set_index('DB ID').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(mRNAvnonO_data['VO'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### Riboswitch sequences

In [None]:
# Reduce riboswitch_protein to its names and give every row a one-element class list.
riboswitch_names = riboswitch_protein[['unique_name']]
riboswitch_protein = riboswitch_names.assign(SO=[['SO_0000035']] * len(riboswitch_names))
riboswitch_protein

# Key the class lists by riboswitch name and merge them into nonO_data
# (entries for existing keys are overwritten).
riboswitchnonO_data = riboswitch_protein.set_index('unique_name').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(riboswitchnonO_data['SO'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### Ribozyme sequences

In [None]:
# Add a one-element class list to every row of ribozyme_rfam_map (defined
# outside this section; it is modified in place).
ribozyme_row_count = len(ribozyme_rfam_map[[1]])
ribozyme_rfam_map['SO'] = [['SO_0000374']] * ribozyme_row_count
ribozyme_rfam_map

# Key the class lists by column 1 and merge them into nonO_data
# (entries for existing keys are overwritten).
ribozymenonO_data = ribozyme_rfam_map.set_index(1).to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(ribozymenonO_data['SO'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### Viral RNA sequences

In [None]:
# List the distinct molecule types, to decide which class each one maps to.
vRNA_ribozyme.moleculeType.unique()

In [None]:
# Records whose molecule type is exactly 'ssRNA', each given a one-element class list.
is_ssrna = vRNA_ribozyme['moleculeType'] == 'ssRNA'
ssRNA = vRNA_ribozyme[['accession', 'moleculeType']].loc[is_ssrna]
ssRNA = ssRNA.assign(SO=[['SO_0001199']] * len(ssRNA))
ssRNA

In [None]:
# Key the class lists by accession and merge them into nonO_data
# (entries for existing keys are overwritten).
ssRNAnonO_data = ssRNA.set_index('accession').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(ssRNAnonO_data['SO'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Tag the remaining molecule types with a one-element class list, merge each
# group into nonO_data (entries for existing keys are overwritten), and re-save
# the pickle after every group.
accession_and_type = vRNA_ribozyme[['accession', 'moleculeType']]

# Molecule type 'ssRNA(-)' -> SO_0001200
ssRNAm = accession_and_type.loc[vRNA_ribozyme['moleculeType'] == 'ssRNA(-)']
ssRNAm = ssRNAm.assign(SO=[['SO_0001200']] * len(ssRNAm))
ssRNAmnonO_data = ssRNAm.set_index('accession').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(ssRNAmnonO_data['SO'])
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Molecule type 'RNA' -> SO_0001169
dsRNA = accession_and_type.loc[vRNA_ribozyme['moleculeType'] == 'RNA']
dsRNA = dsRNA.assign(SO=[['SO_0001169']] * len(dsRNA))
dsRNAnonO_data = dsRNA.set_index('accession').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(dsRNAnonO_data['SO'])
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Missing molecule type -> SO_0001041
viralRNA = accession_and_type.loc[vRNA_ribozyme['moleculeType'].isna()]
viralRNA = viralRNA.assign(SO=[['SO_0001041']] * len(viralRNA))
viralRNAnonO_data = viralRNA.set_index('accession').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(viralRNAnonO_data['SO'])
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### siRNA sequences

In [None]:
# Give every MIT/ICBP siRNA row a one-element class list.
ICBPsiRNA['SO'] = [['SO_0000646']] * len(ICBPsiRNA)
ICBPsiRNA

In [None]:
# Key the class lists by the 'ID#' column and merge them into nonO_data
# (entries for existing keys are overwritten). This reuses the name
# siRNAnonO_data, replacing its previous value.
siRNAnonO_data = ICBPsiRNA.set_index('ID#').to_dict()
nonO_data = {**nonO_data, **siRNAnonO_data['SO']}

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### shRNA sequences

In [None]:
# Give every MIT/ICBP shRNA row a one-element class list.
ICBPshRNA['SO'] = [['SO_0002031']] * len(ICBPshRNA)
ICBPshRNA

In [None]:
# Key the class lists by the 'ID#' column and merge them into nonO_data
# (entries for existing keys are overwritten).
shRNAnonO_data = ICBPshRNA.set_index('ID#').to_dict()
nonO_data = {**nonO_data, **shRNAnonO_data['SO']}

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

***
### Biological roles in ChEBI

In [None]:
# Three role labels, each mapped to the same one-element ChEBI class list.
role_labels = ['General', 'Tumor-Suppressor-Gene', 'Oncogene']
bio_role = pd.DataFrame({
    "role": role_labels,
    "ChEBI": [['CHEBI_24432']] * 3,
})
bio_role

In [None]:
# Key the class lists by role label and merge them into nonO_data
# (entries for existing keys are overwritten).
role_nonO_data = bio_role.set_index('role').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(role_nonO_data['ChEBI'])

In [None]:
# Overwrite the construction-map pickle with the current nonO_data dictionary.
with open('../resources/construction_approach/'+'subclass_construction_map.pkl', 'wb') as handle:
    pickle.dump(nonO_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# DO NOT RUN, this cell is only intended to CHECK everything's OK
# It reloads the pickle written above (replacing the in-memory nonO_data) and
# displays its entries.
nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_map.pkl')

nonO_data.items()