# <p style="text-align: center;">RNA non-ontological entities</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@unimi.it), [TJCallahan](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=callahantiff@gmail.com)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/), [PheKnowLator](https://github.com/callahantiff/PheKnowLator/)  
<!--- **Release:** **[v2.0.0](https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0)** --->
  
<br>  
  
**Purpose:** The identifiers of non-ontological entities **must** be added to PheKnowLator's `subclass_construction_map.pkl`. For each entity, one or more appropriate ontology classes have to be chosen, and the entity is linked to them as a subclass.

<br>

**Assumptions:**   
- [RNA-KG_preparation.ipynb](https://github.com/AnacletoLAB/RNA-KG/blob/main/notebooks/RNA-KG_Preparation.ipynb) notebook has already been run.
- [inteRNA-KG_preparation.ipynb](https://github.com/AnacletoLAB/RNA-KG/blob/main/notebooks/inteRNA-KG_Preparation.ipynb) notebook has already been run.

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts.  
- **Data**: All downloaded and generated data sources are available in the dedicated Zenodo repository ([10.5281/zenodo.10078876](https://zenodo.org/doi/10.5281/zenodo.10078876)). <u>This notebook will download everything you need</u>.  
_____
***

In [None]:
%%capture
import sys
!{sys.executable} -m pip install -r requirements.txt
sys.path.append('../')
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

In [None]:
# Inventory the edge files in '../resources/edge_data/'. File names are parsed
# as "<Interactor1>-<Interactor2><digits>.txt"; the trailing digits of the
# second part are stored as the relation (RO) identifier.
edge_records = []
for file in os.listdir('../resources/edge_data/'):
    # Only '.txt' edge lists are of interest (this also skips '.ipynb_checkpoints').
    if not file.endswith('.txt'):
        continue
    parts = file.split("-")
    if len(parts) < 2:
        # Not an "<Interactor1>-<Interactor2>" name: nothing to parse, skip it
        # instead of failing with an IndexError.
        continue
    edge_records.append({'Interactor1 ID': parts[0],
                         'Interactor2 ID': parts[1].replace('.txt', '')})

# Build the frame once rather than concatenating one row per loop iteration.
# Explicit columns keep the cell working even when no edge file is found.
# Rows are now labelled 0..n-1 (the row-by-row concat labelled every row 0).
df = pd.DataFrame(edge_records, columns=['Interactor1 ID', 'Interactor2 ID'])

# Split the trailing digits off the second interactor label.
df['Relation (RO ID)'] = df['Interactor2 ID'].str.extract(r'(\d+)$', expand=False)
df['Interactor2 ID'] = df['Interactor2 ID'].str.replace(r'(\d+)$', '', regex=True)

# Show every distinct label that appears on either side of an edge file name.
a = set(df['Interactor1 ID'])
b = set(df['Interactor2 ID'])
print(sorted(a.union(b)))

In [None]:
# Fetch the subclass construction map published by the PKT ecosystem into the
# local construction_approach folder.
construction_dir = '../resources/construction_approach/'
data_downloader('https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/subclass_construction_map.pkl',
                construction_dir)

# Load the downloaded pickle.
# NOTE(review): unpickling executes whatever the file contains; only do this
# for a download source you trust.
nonO_data = pd.read_pickle(construction_dir + 'subclass_construction_map.pkl')

# Preview the first five (identifier, classes) pairs of the loaded map.
list(itertools.islice(nonO_data.items(), 5))

***
### ASO sequences

In [None]:
# Unique ASO names from the ASO-mRNA edge file, each paired with the same
# single-element class list ['SO_0000644'].
aso_frame = (
    pd.read_csv('../resources/edge_data/ASO-mRNA3002.txt', sep='\t')['Oligo name in literature']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000644']] * len(frame))
    .set_index('Oligo name in literature')
)

# to_dict() gives {'SO': {name: ['SO_0000644'], ...}}; merge the inner mapping
# into the construction map (new entries win on key clashes).
ASOnonO_data = aso_frame.to_dict()
nonO_data = {**nonO_data, **ASOnonO_data['SO']}

In [None]:
# Preview the last five entries of the map (the most recently merged keys).
all_entries = list(nonO_data.items())
all_entries[-5:]

***
### ASO drugs

In [None]:
# (file, identifier column) pairs that carry ASO drug identifiers.
asod_sources = [
    ('../resources/edge_data/ASOd-mRNA2430.txt', 'DrugBank ID'),
    ('../resources/edge_data/ASOd-disease2606.txt', 'DB ID'),
    ('../resources/edge_data/ASOd-protein11007.txt', 'DrugBank ID'),
    ('../resources/edge_data/ASOd-protein10002.txt', 'DrugBank ID'),
]
asod_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in asod_sources]
).drop_duplicates()

# The source columns have different names, so the concatenated Series is
# unnamed and its frame column is labelled 0. The class column is called 'SO'
# even though the ID assigned here is a CHEBI one.
asod_frame = asod_ids.to_frame().assign(SO=lambda frame: [['CHEBI_76720']] * len(frame))
ASOdnonO_data = asod_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **ASOdnonO_data['SO']}

***
### Aptamer drugs

In [None]:
# Edge files that list aptamer drugs in their 'DrugBank ID' column.
aptamerd_files = [
    '../resources/edge_data/aptamerd-protein2436.txt',
    '../resources/edge_data/aptamerd-disease2606.txt',
]
aptamerd_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['DrugBank ID'] for path in aptamerd_files]
).drop_duplicates()

# Pair every unique drug ID with ['CHEBI_140490'] and merge into the map.
aptamerd_frame = aptamerd_ids.to_frame().assign(SO=lambda frame: [['CHEBI_140490']] * len(frame))
aptamerdnonO_data = aptamerd_frame.set_index('DrugBank ID').to_dict()
nonO_data = {**nonO_data, **aptamerdnonO_data['SO']}

***
### Aptamer sequences

In [None]:
# Edge files that list aptamer sequences in their 'Aptamer' column.
aptamer_files = [
    '../resources/edge_data/aptamer-protein2436.txt',
    '../resources/edge_data/aptamer-chemical2436.txt',
]
aptamer_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['Aptamer'] for path in aptamer_files]
).drop_duplicates()

# Pair every unique aptamer with ['CHEBI_140490'] and merge into the map.
aptamer_frame = aptamer_ids.to_frame().assign(SO=lambda frame: [['CHEBI_140490']] * len(frame))
aptamernonO_data = aptamer_frame.set_index('Aptamer').to_dict()
nonO_data = {**nonO_data, **aptamernonO_data['SO']}

***
### circRNA sequences

In [None]:
# Every edge file with a 'circRNA' column; the read order is kept because it
# determines the insertion order of the resulting keys.
circrna_files = [
    '../resources/edge_data/circRNA-disease3302.txt',
    '../resources/edge_data/circRNA-gocc1018.txt',
    '../resources/edge_data/circRNA-miRNA2434.txt',
    '../resources/edge_data/circRNA-premiRNA2434.txt',
    '../resources/edge_data/circRNA-protein2434.txt',
    '../resources/edge_data/circRNA-RBP2434.txt',
    '../resources/edge_data/circRNA-subCellularLocalization1025.txt',
    '../resources/edge_data/circRNA-TF2434.txt',
]
circrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['circRNA'] for path in circrna_files]
).drop_duplicates()

# Pair every unique circRNA with ['SO_0002291'] and merge into the map.
circrna_frame = circrna_ids.to_frame().assign(SO=lambda frame: [['SO_0002291']] * len(frame))
circRNAnonO_data = circrna_frame.set_index('circRNA').to_dict()
nonO_data = {**nonO_data, **circRNAnonO_data['SO']}

***
### gRNA sequences

In [None]:
# Unique plasmid IDs from the gRNA-gene edge file, each paired with
# ['SO_0000602'].
grna_frame = (
    pd.read_csv('../resources/edge_data/gRNA-gene11007.txt', sep='\t')['Plasmid ID']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000602']] * len(frame))
    .set_index('Plasmid ID')
)

# Merge {plasmid ID: ['SO_0000602']} into the construction map.
gRNAnonO_data = grna_frame.to_dict()
nonO_data = {**nonO_data, **gRNAnonO_data['SO']}

***
### lncRNA sequences

In [None]:
# (file, column) pairs holding lncRNA identifiers. The lncRNA-lncRNA file
# contributes both of its columns. Order is kept because it determines the
# insertion order of the resulting keys.
lncrna_sources = [
    ('../resources/edge_data/lncRNA-go1025.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-go2263.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-go2327.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-go2331.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-go2432.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-go4033.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-goBFO50.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-histoneModification2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-biologicalContext2245.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-rRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-mRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-biologicalContext2291.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-TF2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-lncRNA2434.txt', 'lncRNA1'),
    ('../resources/edge_data/lncRNA-lncRNA2434.txt', 'lncRNA2'),
    ('../resources/edge_data/lncRNA-ribozyme2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-subCellularLocalization1025.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-pDeath56.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-pseudogene2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-protein2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-viralProtein2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-chemical2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-disease3302.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-gocc1018.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-ncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-gobp56.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-biologicalContext2246.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-gene2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-viralmiRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-role2260.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-RBP2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-cell1025.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-scaRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-viralmRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-pw56.txt', 'lncRNA'),
    ('../resources/edge_data/lncRNA-anatomy1025.txt', 'lncRNA'),
    ('../resources/edge_data/premiRNA-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/piRNA-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/miRNA-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/PCG-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/othersRNA-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/snoRNA-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/smallProtein-lncRNA2204.txt', 'lncRNA'),
    ('../resources/edge_data/tRF-lncRNA2434.txt', 'lncRNA'),
    ('../resources/edge_data/tRNA-lncRNA2434.txt', 'lncRNA'),
]
lncrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in lncrna_sources]
).drop_duplicates()

# The column names differ (lncRNA / lncRNA1 / lncRNA2), so the concatenated
# Series is unnamed and its frame column is labelled 0.
lncrna_frame = lncrna_ids.to_frame().assign(SO=lambda frame: [['SO_0001877']] * len(frame))
lncRNAnonO_data = lncrna_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **lncRNAnonO_data['SO']}

***
### miRNA sequences

In [None]:
# Two-column, header-less, tab-separated map. The first file column is
# labelled 1 and the second 0; column 0 holds the identifiers that are tested
# for the 'MIMAT' prefix below.
mirna_mirbase_map = pd.read_csv('../resources/processed_data/MIRNA_MIRBASE_MAP.txt', header=None, names=[1,0], sep='\t')

# Compute the prefix test once and reuse it for both partitions.
is_mature = mirna_mirbase_map[0].str.startswith('MIMAT')

# Take explicit copies before adding a column: assigning to a boolean-filtered
# slice triggers pandas' SettingWithCopyWarning.
mature_mirna = mirna_mirbase_map[is_mature].copy()
mature_mirna['SO'] = [['SO_0000276']] * len(mature_mirna)

pre_mirna = mirna_mirbase_map[~is_mature].copy()
pre_mirna['SO'] = [['SO_0000647']] * len(pre_mirna)

# Stack the two partitions back together (MIMAT-prefixed rows first).
mirna_mirbase_map = pd.concat([mature_mirna, pre_mirna])

# Keep only identifier -> class list and merge it into the construction map.
mirna_nonO = mirna_mirbase_map.drop(columns=1).set_index(0).to_dict()
nonO_data = {**nonO_data, **mirna_nonO['SO']}

***
### mRNA vaccine sequences

In [None]:
# Unique DrugBank IDs from the mRNA-vaccine/disease edge file, each paired
# with ['VO_0000186'] (the class column is still called 'SO').
mrnav_frame = (
    pd.read_csv('../resources/edge_data/mRNAv-disease2606.txt', sep='\t')['DrugBank ID']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['VO_0000186']] * len(frame))
    .set_index('DrugBank ID')
)

# Merge {DrugBank ID: ['VO_0000186']} into the construction map.
mRNAvnonO_data = mrnav_frame.to_dict()
nonO_data = {**nonO_data, **mRNAvnonO_data['SO']}

***
### scaRNA sequences

In [None]:
# Every edge file with a 'scaRNA' column, in the order they are read.
scarna_files = [
    '../resources/edge_data/scaRNA-go1025.txt',
    '../resources/edge_data/scaRNA-go2331.txt',
    '../resources/edge_data/scaRNA-TF2434.txt',
    '../resources/edge_data/scaRNA-RBP2434.txt',
    '../resources/edge_data/miRNA-scaRNA2434.txt',
    '../resources/edge_data/scaRNA-epiMod2434.txt',
    '../resources/edge_data/snoRNA-scaRNA2434.txt',
    '../resources/edge_data/scaRNA-subCellularLocalization1025.txt',
    '../resources/edge_data/mRNA-scaRNA2434.txt',
    '../resources/edge_data/lncRNA-scaRNA2434.txt',
]
scarna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['scaRNA'] for path in scarna_files]
).drop_duplicates()

# Pair every unique scaRNA with ['SO_0002095'] and merge into the map.
scarna_frame = scarna_ids.to_frame().assign(SO=lambda frame: [['SO_0002095']] * len(frame))
scaRNAnonO_data = scarna_frame.set_index('scaRNA').to_dict()
nonO_data = {**nonO_data, **scaRNAnonO_data['SO']}

***
### scRNA sequences

In [None]:
# Every edge file with an 'scRNA' column, in the order they are read.
scrna_files = [
    '../resources/edge_data/scRNA-goBFO50.txt',
    '../resources/edge_data/scRNA-go2331.txt',
    '../resources/edge_data/scRNA-disease3302.txt',
    '../resources/edge_data/scRNA-viralmiRNA2434.txt',
    '../resources/edge_data/scRNA-protein2434.txt',
    '../resources/edge_data/miRNA-scRNA2434.txt',
    '../resources/edge_data/scRNA-mRNA2434.txt',
    '../resources/edge_data/scRNA-RBP2434.txt',
    '../resources/edge_data/scRNA-subCellularLocalization1025.txt',
]
scrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['scRNA'] for path in scrna_files]
).drop_duplicates()

# Pair every unique scRNA with ['SO_0000013'] and merge into the map.
scrna_frame = scrna_ids.to_frame().assign(SO=lambda frame: [['SO_0000013']] * len(frame))
scRNAnonO_data = scrna_frame.set_index('scRNA').to_dict()
nonO_data = {**nonO_data, **scRNAnonO_data['SO']}

***
### snRNA sequences

In [None]:
# (file, column) pairs holding snRNA identifiers. The snRNA-snRNA file
# contributes both of its columns.
snrna_sources = [
    ('../resources/edge_data/snRNA-goBFO50.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-go2331.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-go2327.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-mRNA2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-viralmiRNA2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-protein2434.txt', 'snRNA'),
    ('../resources/edge_data/snoRNA-snRNA2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-snRNA2434.txt', 'snRNA1'),
    ('../resources/edge_data/snRNA-snRNA2434.txt', 'snRNA2'),
    ('../resources/edge_data/snRNA-ev1018.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-RBP2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-disease3302.txt', 'snRNA'),
    ('../resources/edge_data/miRNA-snRNA2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-pseudogene2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-subCellularLocalization1025.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-TF2434.txt', 'snRNA'),
    ('../resources/edge_data/snRNA-viralmRNA2434.txt', 'snRNA'),
]
snrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in snrna_sources]
).drop_duplicates()

# Mixed column names leave the concatenated Series unnamed, hence column 0.
snrna_frame = snrna_ids.to_frame().assign(SO=lambda frame: [['SO_0000274']] * len(frame))
snRNAnonO_data = snrna_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **snRNAnonO_data['SO']}

***
### tRNA sequences

In [None]:
# Every edge file with a 'tRNA' column, in the order they are read.
trna_files = [
    '../resources/edge_data/tRNA-go2331.txt',
    '../resources/edge_data/tRNA-go2327.txt',
    '../resources/edge_data/tRNA-subCellularLocalization1025.txt',
    '../resources/edge_data/snoRNA-tRNA2434.txt',
    '../resources/edge_data/tRNA-mRNA_gtRNA2434.txt',
    '../resources/edge_data/tRNA-TF2434.txt',
    '../resources/edge_data/tRNA-aminoacid2436.txt',
    '../resources/edge_data/tRF-tRNA_MINTbase2202.txt',
    '../resources/edge_data/tRF-tRNA_tRFdb2202.txt',
    '../resources/edge_data/tRNA-lncRNA2434.txt',
    '../resources/edge_data/tRNA-mRNA_NCBI2434.txt',
    '../resources/edge_data/tRNA-modification2434.txt',
]
trna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['tRNA'] for path in trna_files]
).drop_duplicates()

# Pair every unique tRNA with ['SO_0000253'] and merge into the map.
trna_frame = trna_ids.to_frame().assign(SO=lambda frame: [['SO_0000253']] * len(frame))
tRNAnonO_data = trna_frame.set_index('tRNA').to_dict()
nonO_data = {**nonO_data, **tRNAnonO_data['SO']}

***
### Retained intron sequences

In [None]:
# Unique retained-intron identifiers from the snoRNA/retained-intron edge
# file, each paired with ['SO_0000188'].
ri_frame = (
    pd.read_csv('../resources/edge_data/snoRNA-retainedIntron2434.txt', sep='\t')['Retained intron']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000188']] * len(frame))
    .set_index('Retained intron')
)

# Merge {retained intron: ['SO_0000188']} into the construction map.
rinonO_data = ri_frame.to_dict()
nonO_data = {**nonO_data, **rinonO_data['SO']}

***
### rRNA sequences

In [None]:
# (file, column) pairs holding rRNA identifiers. The rRNA-rRNA file
# contributes both of its columns.
rrna_sources = [
    ('../resources/edge_data/rRNA-go2327.txt', 'rRNA'),
    ('../resources/edge_data/rRNA-go1025.txt', 'rRNA'),
    ('../resources/edge_data/pseudogene-rRNA2434.txt', 'rRNA'),
    ('../resources/edge_data/lncRNA-rRNA2434.txt', 'rRNA'),
    ('../resources/edge_data/othersRNA-rRNA2434.txt', 'rRNA'),
    ('../resources/edge_data/snoRNA-rRNA2434.txt', 'rRNA'),
    ('../resources/edge_data/rRNA-RBP2434.txt', 'rRNA'),
    ('../resources/edge_data/rRNA-rRNA2434.txt', 'rRNA1'),
    ('../resources/edge_data/rRNA-rRNA2434.txt', 'rRNA2'),
    ('../resources/edge_data/rRNA-subCellularLocalization1025.txt', 'rRNA'),
    ('../resources/edge_data/mRNA-rRNA2434.txt', 'rRNA'),
    ('../resources/edge_data/rRNA-TF2434.txt', 'rRNA'),
]
rrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in rrna_sources]
).drop_duplicates()

# Mixed column names leave the concatenated Series unnamed, hence column 0.
rrna_frame = rrna_ids.to_frame().assign(SO=lambda frame: [['SO_0000252']] * len(frame))
rRNAnonO_data = rrna_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **rRNAnonO_data['SO']}

***
### miscRNA sequences

In [None]:
# Unique miscRNA identifiers from the snoRNA-miscRNA edge file, each paired
# with ['SO_0000356'].
miscrna_frame = (
    pd.read_csv('../resources/edge_data/snoRNA-miscRNA2434.txt', sep='\t')['miscRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000356']] * len(frame))
    .set_index('miscRNA')
)

# Merge {miscRNA: ['SO_0000356']} into the construction map.
unknownRNAnonO_data = miscrna_frame.to_dict()
nonO_data = {**nonO_data, **unknownRNAnonO_data['SO']}

***
### mRNA sequences

In [None]:
# (file, column) pairs holding mRNA identifiers. The mRNA-mRNA file appears
# twice, once per column, at the same positions as in the read order below.
mrna_sources = [
    ('../resources/edge_data/mRNA-TF2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-viralmiRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-histoneModification2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-mRNA2434.txt', 'mRNA1'),
    ('../resources/edge_data/mRNA-protein2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-disease3302.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-ncRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-RBP2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-ev1018.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-scaRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-viralnsRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-subCellularLocalization1025.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-rRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-chemical2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-gene2434.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-cell1025.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-anatomy1025.txt', 'mRNA'),
    ('../resources/edge_data/pseudogene-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/snRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/lncRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/piRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/tRNA-mRNA_gtRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/ASO-mRNA3002.txt', 'mRNA'),
    ('../resources/edge_data/mRNA-mRNA2434.txt', 'mRNA2'),
    ('../resources/edge_data/tRF-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/snoRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/miRNA-mRNA11002.txt', 'mRNA'),
    ('../resources/edge_data/miRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/siRNAd-mRNA2430.txt', 'mRNA'),
    ('../resources/edge_data/othersRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/shRNA-mRNA2430.txt', 'mRNA'),
    ('../resources/edge_data/scRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/ASOd-mRNA2430.txt', 'mRNA'),
    ('../resources/edge_data/premiRNA-mRNA11002.txt', 'mRNA'),
    ('../resources/edge_data/eRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/premiRNA-mRNA2434.txt', 'mRNA'),
    ('../resources/edge_data/siRNA-mRNA2430.txt', 'mRNA'),
    ('../resources/edge_data/tRNA-mRNA_NCBI2434.txt', 'mRNA'),
]
mrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in mrna_sources]
).drop_duplicates()

# Mixed column names (mRNA / mRNA1 / mRNA2) leave the concatenated Series
# unnamed, hence the frame column labelled 0.
mrna_frame = mrna_ids.to_frame().assign(SO=lambda frame: [['SO_0000234']] * len(frame))
mRNAnonO_data = mrna_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **mRNAnonO_data['SO']}

***
### TEC sequences

In [None]:
# Unique TEC identifiers from the TEC-chemical edge file, each paired with
# ['SO_0002139'].
tec_frame = (
    pd.read_csv('../resources/edge_data/TEC-chemical2434.txt', sep='\t')['TEC']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0002139']] * len(frame))
    .set_index('TEC')
)

# Merge {TEC: ['SO_0002139']} into the construction map.
TECnonO_data = tec_frame.to_dict()
nonO_data = {**nonO_data, **TECnonO_data['SO']}

***
### tsRNA sequences

In [None]:
# (file, column) pairs holding tsRNA identifiers; the disease file stores
# them in a column named 'tRF'.
tsrna_sources = [
    ('../resources/edge_data/miRNA-tsRNA2434.txt', 'tsRNA'),
    ('../resources/edge_data/tsRNA-disease3302.txt', 'tRF'),
]
tsrna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in tsrna_sources]
).drop_duplicates()

# NOTE(review): SO_0000253 is the same class ID the tRNA cell assigns;
# confirm that reusing it for tsRNAs is intended.
# Mixed column names leave the concatenated Series unnamed, hence column 0.
tsrna_frame = tsrna_ids.to_frame().assign(SO=lambda frame: [['SO_0000253']] * len(frame))
tsRNAnonO_data = tsrna_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **tsRNAnonO_data['SO']}

***
### Riboswitch sequences

In [None]:
# Every edge file with a 'Riboswitch' column, in the order they are read.
riboswitch_files = [
    '../resources/edge_data/riboswitch-bactStrain2434.txt',
    '../resources/edge_data/riboswitch-gobp56.txt',
    '../resources/edge_data/riboswitch-protein2529.txt',
]
riboswitch_ids = pd.concat(
    [pd.read_csv(path, sep='\t')['Riboswitch'] for path in riboswitch_files]
).drop_duplicates()

# Pair every unique riboswitch with ['SO_0000035'] and merge into the map.
riboswitch_frame = riboswitch_ids.to_frame().assign(SO=lambda frame: [['SO_0000035']] * len(frame))
riboswitchnonO_data = riboswitch_frame.set_index('Riboswitch').to_dict()
nonO_data = {**nonO_data, **riboswitchnonO_data['SO']}

***
### Ribozyme sequences

In [None]:
# (file, column) pairs holding ribozyme identifiers; two files spell the
# column in lower case ('ribozyme').
ribozyme_sources = [
    ('../resources/edge_data/ribozyme-go2327.txt', 'Ribozyme'),
    ('../resources/edge_data/ribozyme-gomf1025.txt', 'Ribozyme'),
    ('../resources/edge_data/miRNA-ribozyme2434.txt', 'Ribozyme'),
    ('../resources/edge_data/lncRNA-ribozyme2434.txt', 'Ribozyme'),
    ('../resources/edge_data/ribozyme-gocc85.txt', 'Ribozyme'),
    ('../resources/edge_data/ribozyme-TF2434.txt', 'ribozyme'),
    ('../resources/edge_data/viralRNA-ribozyme2526.txt', 'Ribozyme'),
    ('../resources/edge_data/ribozyme-gobp56.txt', 'Ribozyme'),
    ('../resources/edge_data/ribozyme-RBP2434.txt', 'ribozyme'),
    ('../resources/edge_data/ribozyme-protein2434.txt', 'Ribozyme'),
]
ribozyme_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in ribozyme_sources]
).drop_duplicates()

# Mixed column names leave the concatenated Series unnamed, hence column 0.
ribozyme_frame = ribozyme_ids.to_frame().assign(SO=lambda frame: [['SO_0000374']] * len(frame))
ribozymenonO_data = ribozyme_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **ribozymenonO_data['SO']}

***
### Viral RNA sequences

In [None]:
# Build a table of viral RNA records annotated with the ribozymes found in
# them, for use by the following cells (which read 'accession' and
# 'moleculeType' from it).

# Lookup table: ribozyme name (column 0) -> Rfam family/clan path (column 1).
# No column names are given, so the columns carry the integer labels 0 and 1.
ribozyme_rfam_map = pd.DataFrame(data=[['LC ribozyme','family/RF00011'],
                                 ['hammerhead ribozyme','clan/CL00010'],
                                 ['glmS ribozyme','family/RF00234'],
                                 ['HDV-F-prausnitzii','family/RF02682'],
                                 ['HDV ribozyme','family/RF00094'],
                                 ['HDV_ribozyme','family/RF00094'],
                                 ['Hairpin','family/RF00173'],
                                 ['Hammerhead_1','clan/CL00010'],
                                 ['Hammerhead_HH9','clan/CL00010'],
                                 ['Hammerhead_3','clan/CL00010'],
                                 ['Hammerhead_HH10','clan/CL00010'],
                                 ['Hammerhead_II','clan/CL00010'],
                                 ['Pistol','family/RF02679'],
                                 ['Pistol ribozyme','family/RF02679'],
                                 ['twister ribozyme','clan/CL00120'],
                                 ['Twister-P5','clan/CL00120'],
                                 ['Twister-P3','clan/CL00120'],
                                 ['RNAse P','family/RF00009']#,
                                 #['VS ribozyme',''] absent in RFAM
                                 ])

# Load the JSON dump and transpose it; the code below then treats each row as
# one record with a 'ribozymes' text field.
vRNA_ribozyme = pd.read_json('../resources/processed_data/unprocessed_data/all.json').T 

# Extract ribozyme hit lines: every non-greedy match from a newline followed
# by ">> " up to the next newline, then strip the newlines and the ">> " marker.
myre = re.compile(r"\n>> .*?\n")
ribozyme = [myre.findall(i) for i in vRNA_ribozyme.ribozymes]
ribozyme = [[j.replace("\n",'').replace(">> ",'') for j in i] for i in ribozyme]

# List of all possible ribozymes (useful for mapping)
a = [i for j in ribozyme for i in j]
# The set below is not the cell's last expression, so it is computed and discarded.
set(a)
# Put the per-record hit lists next to the records; the unnamed Series becomes
# a column labelled 0.
vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']),
                           pd.Series(ribozyme)], axis=1)
# One row per (record, ribozyme hit).
vRNA_ribozyme = vRNA_ribozyme.explode(0)
# Keep only the first whitespace-delimited token of each hit line.
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
# Drop the metadata columns that are not needed downstream (some labels are
# listed more than once).
vRNA_ribozyme.drop(columns=['isolationSource','collectionDate','gc','bioSample','genus','family','identicalSeqs','genBankTitle','displayTitle','length',
                           'sequenceType','nucCompleteness','genotype','segment','publications',
                           'geoLocation','country','usa','submitters','releaseDate','isolate',
                            'genus','family','sequence','structure','type','ribozymes','Cls_ID80',
                            'Cls_ID70','Cls_ID85','Cls_ID75','Cls_ID95','Cls_ID90','sraAccession','submitters','species','host'],
                   inplace=True)
# Move the ribozyme-name column to the front and relabel it from 0 to 1.
vRNA_ribozyme.insert(0,1,vRNA_ribozyme.pop(0))
# Bare expression in the middle of the cell: it has no effect and is not displayed.
vRNA_ribozyme
# Join on ribozyme name (map column 0 == record column 1); merge defaults to an
# inner join, so hits without an entry in the lookup table are dropped.
# Both frames have a column labelled 1; presumably that clash is what produces
# the suffixed '1_x' / '1_y' columns (the latter is dropped below).
vRNA_ribozyme = pd.merge(ribozyme_rfam_map,vRNA_ribozyme,left_on=0,right_on=1)
vRNA_ribozyme.drop(columns=[0],inplace=True)
# Move 'accession' to the second position.
vRNA_ribozyme.insert(1,'accession',vRNA_ribozyme.pop('accession'))
vRNA_ribozyme['Source(s)'] = 'ViroidDB'
vRNA_ribozyme = vRNA_ribozyme.drop(columns=['1_y'])

# Display the distinct molecule types; the following cells filter on them.
vRNA_ribozyme.moleculeType.unique()

In [None]:
# Records whose molecule type is exactly 'ssRNA'. An explicit copy is taken
# before adding a column, because assigning to a filtered slice triggers
# pandas' SettingWithCopyWarning.
ssRNA = vRNA_ribozyme[['accession', 'moleculeType']].loc[vRNA_ribozyme['moleculeType'] == 'ssRNA'].copy()
# Every selected accession gets the same single-element class list.
ssRNA['SO'] = [['SO_0001199']] * len(ssRNA)
ssRNA

In [None]:
# Index the ssRNA records by accession and keep only the class column, giving
# {'SO': {accession: ['SO_0001199'], ...}}.
ssrna_by_accession = ssRNA.set_index('accession').drop(columns=['moleculeType'])
ssRNAnonO_data = ssrna_by_accession.to_dict()

# Merge the accession -> class list mapping into the construction map.
nonO_data = {**nonO_data, **ssRNAnonO_data['SO']}

In [None]:
# Assign a class to the remaining viral RNA records, by molecule type. Each
# filtered selection is copied before a column is added, because assigning to
# a filtered slice triggers pandas' SettingWithCopyWarning.

# Molecule type 'ssRNA(-)'.
ssRNAm = vRNA_ribozyme[['accession', 'moleculeType']].loc[vRNA_ribozyme['moleculeType'] == 'ssRNA(-)'].copy()
ssRNAm['SO'] = [['SO_0001200']] * len(ssRNAm)
ssRNAmnonO_data = ssRNAm.drop(columns=['moleculeType']).set_index('accession').to_dict()
nonO_data = {**nonO_data, **ssRNAmnonO_data['SO']}

# Molecule type 'RNA'.
# NOTE(review): the variable is called dsRNA but the filter matches the plain
# label 'RNA'; confirm that these records are meant to receive SO_0001169.
dsRNA = vRNA_ribozyme[['accession', 'moleculeType']].loc[vRNA_ribozyme['moleculeType'] == 'RNA'].copy()
dsRNA['SO'] = [['SO_0001169']] * len(dsRNA)
dsRNAnonO_data = dsRNA.drop(columns=['moleculeType']).set_index('accession').to_dict()
nonO_data = {**nonO_data, **dsRNAnonO_data['SO']}

# Records with no molecule type at all.
viralRNA = vRNA_ribozyme[['accession', 'moleculeType']].loc[vRNA_ribozyme['moleculeType'].isna()].copy()
viralRNA['SO'] = [['SO_0001041']] * len(viralRNA)
viralRNAnonO_data = viralRNA.drop(columns=['moleculeType']).set_index('accession').to_dict()
nonO_data = {**nonO_data, **viralRNAnonO_data['SO']}

***
### siRNA sequences

In [None]:
# Unique siRNA identifiers from the siRNA-mRNA edge file, each paired with
# ['SO_0000646'].
sirna_frame = (
    pd.read_csv('../resources/edge_data/siRNA-mRNA2430.txt', sep='\t')['siRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000646']] * len(frame))
    .set_index('siRNA')
)

# Merge {siRNA: ['SO_0000646']} into the construction map.
siRNAnonO_data = sirna_frame.to_dict()
nonO_data = {**nonO_data, **siRNAnonO_data['SO']}

***
### shRNA sequences

In [None]:
# Unique shRNA identifiers from the shRNA-mRNA edge file, each paired with
# ['SO_0002031'].
shrna_frame = (
    pd.read_csv('../resources/edge_data/shRNA-mRNA2430.txt', sep='\t')['shRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0002031']] * len(frame))
    .set_index('shRNA')
)

# Merge {shRNA: ['SO_0002031']} into the construction map.
shRNAnonO_data = shrna_frame.to_dict()
nonO_data = {**nonO_data, **shRNAnonO_data['SO']}

***
### snoRNA sequences

In [None]:
# (file, column) pairs holding snoRNA identifiers. The snoRNA-snoRNA file
# contributes both of its columns.
snorna_sources = [
    ('../resources/edge_data/snoRNA-go2327.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-go2331.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-go1025.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-chemical2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-tRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-disease3302.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-viralmiRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-snRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-protein2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-rRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-RBP2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-pseudogene2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-pDeath56.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-mRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-gene2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-lncRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-TF2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-epiMod2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-premiRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-scaRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-retainedIntron2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-miRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-miscRNA2434.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-subCellularLocalization1025.txt', 'snoRNA'),
    ('../resources/edge_data/snoRNA-snoRNA2434.txt', 'snoRNA1'),
    ('../resources/edge_data/snoRNA-snoRNA2434.txt', 'snoRNA2'),
]
snorna_ids = pd.concat(
    [pd.read_csv(path, sep='\t')[column] for path, column in snorna_sources]
).drop_duplicates()

# Mixed column names leave the concatenated Series unnamed, hence column 0.
snorna_frame = snorna_ids.to_frame().assign(SO=lambda frame: [['SO_0000275']] * len(frame))
snoRNAnonO_data = snorna_frame.set_index(0).to_dict()
nonO_data = {**nonO_data, **snoRNAnonO_data['SO']}

***
### Small proteins

In [None]:
# Map every distinct small-protein identifier to the class list ['SO_0000104']
# and fold the result into the running nonO_data map.
spnonO_data = (
    pd.read_csv('../resources/edge_data/smallProtein-lncRNA2204.txt', sep='\t')['Small protein']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000104']] * len(frame))
    .set_index('Small protein')
    .to_dict()
)
nonO_data = {**nonO_data, **spnonO_data['SO']}

***
### siRNA drugs

In [None]:
# Distinct DrugBank identifiers of siRNA drugs, taken from both siRNA-drug edge files.
siRNAdnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['DrugBank ID']
    for path in (
        '../resources/edge_data/siRNAd-mRNA2430.txt',
        '../resources/edge_data/siRNAd-disease2606.txt',
    )
]).drop_duplicates()

# Each drug is linked to two classes (one SO, one ChEBI).
siRNAdnonO_data = (
    siRNAdnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0002031', 'CHEBI_23888']] * len(frame))
    .set_index('DrugBank ID')
    .to_dict()
)
nonO_data = {**nonO_data, **siRNAdnonO_data['SO']}

***
### Biological roles in ChEBI

In [None]:
# Hand-curated table: each biological-role label is linked to ['CHEBI_24432'].
bio_role = pd.DataFrame({
    'role': ['General', 'Tumor-Suppressor-Gene', 'Oncogene'],
    'ChEBI': [['CHEBI_24432']] * 3,
})
bio_role

In [None]:
# Re-key the table by role label, then fold its ChEBI column into a fresh copy
# of the running map (entries from the table win on key collisions).
role_nonO_data = bio_role.set_index('role').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(role_nonO_data['ChEBI'])

***
### Epigenetic modifications in GO

In [None]:
# Display the distinct epigenetic-modification labels present in the two edge files.
pd.concat([
    pd.read_csv(path, sep='\t')['Epigenetic modification']
    for path in (
        '../resources/edge_data/miRNA-epiMod2434.txt',
        '../resources/edge_data/premiRNA-epiMod2434.txt',
    )
]).unique()

In [None]:
# Hand-curated table: these modification labels are all linked to ['GO_0016571'].
epiMod = pd.DataFrame({
    'mod': ['H3K4me3', 'H3K9me2', 'H3K9me3', 'H3K27me3', 'H3K4me', 'H3K79me2', 'H3K4me2',
            'H3K9me', 'H3K27me', 'H3K36me2', 'H3R17me2'],
}).assign(GO=lambda frame: [['GO_0016571']] * len(frame))
epiMod

In [None]:
# Re-key the current epiMod table by modification label and fold its GO column
# into a fresh copy of the running map.
go_nonO_data = epiMod.set_index('mod').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Hand-curated single-row table: 'H3S10P' is linked to ['GO_0006468'].
epiMod = pd.DataFrame({
    'mod': ['H3S10P'],
    'GO': [['GO_0006468']],
})
epiMod

In [None]:
# Re-key the current epiMod table by modification label and fold its GO column
# into a fresh copy of the running map.
go_nonO_data = epiMod.set_index('mod').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

In [None]:
# Hand-curated table: these modification labels are all linked to ['GO_0016573'].
epiMod = pd.DataFrame({
    'mod': ['H3ac', 'H4ac', 'H3K9ac', 'H5ac', 'H3K4ac', 'H3K14ac'],
}).assign(GO=lambda frame: [['GO_0016573']] * len(frame))
epiMod

In [None]:
# Re-key the current epiMod table by modification label and fold its GO column
# into a fresh copy of the running map.
go_nonO_data = epiMod.set_index('mod').to_dict()
nonO_data = dict(nonO_data)
nonO_data.update(go_nonO_data['GO'])

***
### Pseudogene sequences

In [None]:
# Gather every pseudogene identifier found in the edge files.
# Each pair is (edge file, column holding the pseudogene identifier); the
# pseudogene-pseudogene file contributes both of its endpoint columns.
pseudononO_data = pd.concat([
    pd.read_csv(path, sep='\t')[column]
    for path, column in [
        ('../resources/edge_data/pseudogene-rRNA2434.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudogene-protein2434.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudogene-mRNA2434.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudo-TF2434.txt', 'pseudo'),
        ('../resources/edge_data/pseudo-chemical2434.txt', 'pseudo'),
        ('../resources/edge_data/pseudo-disease3302.txt', 'pseudo'),
        ('../resources/edge_data/pseudo-RBP2434.txt', 'pseudo'),
        ('../resources/edge_data/snoRNA-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/lncRNA-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudo-viralmiRNA2434.txt', 'pseudo'),
        ('../resources/edge_data/miRNA-pseudogene11002.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudo-histoneModification2434.txt', 'pseudo'),
        ('../resources/edge_data/pseudogene-pseudogene2434.txt', 'Pseudogene1'),
        ('../resources/edge_data/pseudogene-pseudogene2434.txt', 'Pseudogene2'),
        ('../resources/edge_data/premiRNA-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudo-viralmRNA2434.txt', 'pseudo'),
        ('../resources/edge_data/tRF-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/othersRNA-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/miRNA-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/snRNA-pseudogene2434.txt', 'Pseudogene'),
        ('../resources/edge_data/pseudo-subCellularLocalization1025.txt', 'pseudo'),
    ]
]).drop_duplicates()

# The source columns carry different names, so the concatenated Series is
# unnamed and its frame column is labelled 0.
pseudononO_data = (
    pseudononO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000336']] * len(frame))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **pseudononO_data['SO']}

***
### Y_RNA sequences

In [None]:
# Map every distinct Y_RNA identifier to the class list ['SO_0000405']
# and fold the result into the running nonO_data map.
YnonO_data = (
    pd.read_csv('../resources/edge_data/Y_RNA-subCellularLocalization1025.txt', sep='\t')['Y_RNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000405']] * len(frame))
    .set_index('Y_RNA')
    .to_dict()
)
nonO_data = {**nonO_data, **YnonO_data['SO']}

***
### eRNA sequences

In [None]:
# Map every distinct eRNA identifier to the class list ['SO_0000165']
# and fold the result into the running nonO_data map.
eRNAnonO_data = (
    pd.read_csv('../resources/edge_data/eRNA-mRNA2434.txt', sep='\t')['eRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000165']] * len(frame))
    .set_index('eRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **eRNAnonO_data['SO']}

***
### Histone modifications

In [None]:
# Distinct histone-modification labels across all *-histoneModification edge files.
hModnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['Histone Modification']
    for path in (
        '../resources/edge_data/premiRNA-histoneModification2434.txt',
        '../resources/edge_data/lncRNA-histoneModification2434.txt',
        '../resources/edge_data/mRNA-histoneModification2434.txt',
        '../resources/edge_data/pseudo-histoneModification2434.txt',
        '../resources/edge_data/unknown-histoneModification2434.txt',
        '../resources/edge_data/others-histoneModification2434.txt',
        '../resources/edge_data/ncRNA-histoneModification2434.txt',
    )
]).drop_duplicates()

# Link each label to ['SO_0001700'] and fold the result into the running map.
hModnonO_data = (
    hModnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0001700']] * len(frame))
    .set_index('Histone Modification')
    .to_dict()
)
nonO_data = {**nonO_data, **hModnonO_data['SO']}

***
### lincRNA sequences

In [None]:
# Map every distinct lincRNA identifier to the class list ['SO_0001463']
# and fold the result into the running nonO_data map.
lincRNAnonO_data = (
    pd.read_csv('../resources/edge_data/lincRNA-subCellularLocalization1025.txt', sep='\t')['lincRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0001463']] * len(frame))
    .set_index('lincRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **lincRNAnonO_data['SO']}

***
### mtRNA sequences

In [None]:
# Map every distinct mtRNA identifier to the class list ['NCIT_C25975'].
# The column is still called 'SO', like the other sections, although the class is an NCIT term.
mtRNAnonO_data = (
    pd.read_csv('../resources/edge_data/mtRNA-subCellularLocalization1025.txt', sep='\t')['mtRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['NCIT_C25975']] * len(frame))
    .set_index('mtRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **mtRNAnonO_data['SO']}

***
### ncRNA sequences

In [None]:
# Distinct ncRNA identifiers across every edge file that has an 'ncRNA' column.
ncRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['ncRNA']
    for path in (
        '../resources/edge_data/ncRNA-go2263.txt',
        '../resources/edge_data/ncRNA-go1025.txt',
        '../resources/edge_data/ncRNA-go2327.txt',
        '../resources/edge_data/ncRNA-go2432.txt',
        '../resources/edge_data/ncRNA-go4033.txt',
        '../resources/edge_data/ncRNA-goBFO50.txt',
        '../resources/edge_data/ncRNA-go2331.txt',
        '../resources/edge_data/lncRNA-ncRNA2434.txt',
        '../resources/edge_data/miRNA-ncRNA2434.txt',
        '../resources/edge_data/mRNA-ncRNA2434.txt',
        '../resources/edge_data/ncRNA-histoneModification2434.txt',
        '../resources/edge_data/ncRNA-protein2434.txt',
        '../resources/edge_data/ncRNA-RBP2434.txt',
        '../resources/edge_data/ncRNA-subCellularLocalization1025.txt',
        '../resources/edge_data/ncRNA-TF2434.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0000655'] and fold the result into the running map.
ncRNAnonO_data = (
    ncRNAnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000655']] * len(frame))
    .set_index('ncRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **ncRNAnonO_data['SO']}

***
### othersRNA sequences

In [None]:
# Gather every "other RNA" identifier found in the edge files.
# Each pair is (edge file, column holding the identifier); column naming
# differs from file to file.
othersRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')[column]
    for path, column in [
        ('../resources/edge_data/others-subCellularLocalization1025.txt', 'others'),
        ('../resources/edge_data/othersRNA-rRNA2434.txt', 'Others RNA'),
        ('../resources/edge_data/othersRNA-lncRNA2434.txt', 'Others RNA'),
        ('../resources/edge_data/othersRNA-mRNA2434.txt', 'Others RNA'),
        ('../resources/edge_data/miRNA-othersRNA2434.txt', 'others RNA'),
        ('../resources/edge_data/other-viralmiRNA2434.txt', 'other'),
        ('../resources/edge_data/othersRNA-protein2434.txt', 'Others RNA'),
        ('../resources/edge_data/othersRNA-pseudogene2434.txt', 'Others RNA'),
        ('../resources/edge_data/others-TF2434.txt', 'others'),
        ('../resources/edge_data/others-gene2434.txt', 'others'),
        ('../resources/edge_data/others-RBP2434.txt', 'others'),
        ('../resources/edge_data/others-histoneModification2434.txt', 'others'),
    ]
]).drop_duplicates()

# The source columns carry different names, so the concatenated Series is
# unnamed and its frame column is labelled 0.
othersRNAnonO_data = (
    othersRNAnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000356']] * len(frame))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **othersRNAnonO_data['SO']}

***
### piRNA sequences

In [None]:
# Map every distinct piRNA name from the property table to ['SO_0001035']
# and fold the result into the running nonO_data map.
piRNAnonO_data = (
    pd.read_csv('../resources/property_data/piRNA.csv')['Name']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0001035']] * len(frame))
    .set_index('Name')
    .to_dict()
)
nonO_data = {**nonO_data, **piRNAnonO_data['SO']}

***
### sRNA sequences

In [None]:
# Map every distinct sRNA identifier to the class list ['SO_0002022']
# and fold the result into the running nonO_data map.
sRNAnonO_data = (
    pd.read_csv('../resources/edge_data/sRNA-TF2434.txt', sep='\t')['sRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0002022']] * len(frame))
    .set_index('sRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **sRNAnonO_data['SO']}

***
### Telomerase RNA sequences

In [None]:
# Distinct telomerase RNA identifiers across the teloRNA-GO edge files.
teloRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['Telomerase RNA']
    for path in (
        '../resources/edge_data/teloRNA-goBFO50.txt',
        '../resources/edge_data/teloRNA-go2331.txt',
        '../resources/edge_data/teloRNA-go2326.txt',
        '../resources/edge_data/teloRNA-go2325.txt',
        '../resources/edge_data/teloRNA-go1025.txt',
        '../resources/edge_data/teloRNA-go2327.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0000390'] and fold the result into the running map.
teloRNAnonO_data = (
    teloRNAnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000390']] * len(frame))
    .set_index('Telomerase RNA')
    .to_dict()
)
nonO_data = {**nonO_data, **teloRNAnonO_data['SO']}

***
### RNAse P sequences

In [None]:
# Distinct RNAse P RNA identifiers across the RNAseP-GO edge files.
RNAsePnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['RNAseP RNA']
    for path in (
        '../resources/edge_data/RNAseP-goBFO50.txt',
        '../resources/edge_data/RNAseP-go2331.txt',
        '../resources/edge_data/RNAseP-go2327.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0000386'] and fold the result into the running map.
RNAsePnonO_data = (
    RNAsePnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000386']] * len(frame))
    .set_index('RNAseP RNA')
    .to_dict()
)
nonO_data = {**nonO_data, **RNAsePnonO_data['SO']}

***
### RNAse MRP sequences

In [None]:
# Distinct RNAse MRP RNA identifiers across the RNAseMRP-GO edge files.
RNAseMRPnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['RNAseMRP RNA']
    for path in (
        '../resources/edge_data/RNAseMRP-go1025.txt',
        '../resources/edge_data/RNAseMRP-go2326.txt',
        '../resources/edge_data/RNAseMRP-go2327.txt',
        '../resources/edge_data/RNAseMRP-go2331.txt',
        '../resources/edge_data/RNAseMRP-goBFO50.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0000385'] and fold the result into the running map.
RNAseMRPnonO_data = (
    RNAseMRPnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000385']] * len(frame))
    .set_index('RNAseMRP RNA')
    .to_dict()
)
nonO_data = {**nonO_data, **RNAseMRPnonO_data['SO']}

***
### tRF sequences

In [None]:
# Distinct tRF identifiers across every edge file that has a 'tRF' column.
tRFnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['tRF']
    for path in (
        '../resources/edge_data/tRF-cellLine1025.txt',
        '../resources/edge_data/tRF-lncRNA2434.txt',
        '../resources/edge_data/tRF-mRNA2434.txt',
        '../resources/edge_data/tRF-pseudogene2434.txt',
        '../resources/edge_data/tRF-tRNA_MINTbase2202.txt',
        '../resources/edge_data/tRF-tRNA_tRFdb2202.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0001172'] and fold the result into the running map.
tRFnonO_data = (
    tRFnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0001172']] * len(frame))
    .set_index('tRF')
    .to_dict()
)
nonO_data = {**nonO_data, **tRFnonO_data['SO']}

***
### unknownRNA sequences

In [None]:
# Gather every "unknown RNA" identifier found in the edge files.
# Each pair is (edge file, column holding the identifier).
unknownRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')[column]
    for path, column in [
        ('../resources/edge_data/unknown-viralmiRNA2434.txt', 'unknown'),
        ('../resources/edge_data/unknownRNA-protein2434.txt', 'Unknown RNA'),
        ('../resources/edge_data/unknown-histoneModification2434.txt', 'unknown'),
        ('../resources/edge_data/unknown-TF2434.txt', 'unknown'),
        ('../resources/edge_data/miRNA-unknownRNA2434.txt', 'Unknown RNA'),
    ]
]).drop_duplicates()

# The source columns carry different names, so the concatenated Series is
# unnamed and its frame column is labelled 0.
unknownRNAnonO_data = (
    unknownRNAnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000356']] * len(frame))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **unknownRNAnonO_data['SO']}

***
### vRNA sequences

In [None]:
# Map every distinct vRNA identifier to the class list ['SO_0001041']
# and fold the result into the running nonO_data map.
vRNAnonO_data = (
    pd.read_csv('../resources/edge_data/vRNA-subCellularLocalization1025.txt', sep='\t')['vRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0001041']] * len(frame))
    .set_index('vRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **vRNAnonO_data['SO']}

***
### viralmiRNA sequences

In [None]:
# Distinct viral miRNA identifiers across every edge file that has a
# 'Viral miRNA' column.
# This section stores its result in its own variable (viralmiRNAnonO_data)
# rather than reusing vRNAnonO_data, so the vRNA section's result is not
# overwritten and the name matches the per-section naming used elsewhere.
viralmiRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['Viral miRNA']
    for path in (
        '../resources/edge_data/scRNA-viralmiRNA2434.txt',
        '../resources/edge_data/snRNA-viralmiRNA2434.txt',
        '../resources/edge_data/mRNA-viralmiRNA2434.txt',
        '../resources/edge_data/snoRNA-viralmiRNA2434.txt',
        '../resources/edge_data/unknown-viralmiRNA2434.txt',
        '../resources/edge_data/pseudo-viralmiRNA2434.txt',
        '../resources/edge_data/protein-viralmiRNA2434.txt',
        '../resources/edge_data/miRNA-viralmiRNA2434.txt',
        '../resources/edge_data/other-viralmiRNA2434.txt',
        '../resources/edge_data/lncRNA-viralmiRNA2434.txt',
        '../resources/edge_data/premiRNA-viralmiRNA2434.txt',
    )
]).drop_duplicates()

# Each viral miRNA is linked to two SO classes.
viralmiRNAnonO_data = pd.DataFrame(viralmiRNAnonO_data)
viralmiRNAnonO_data['SO'] = [['SO_0001041', 'SO_0000276']] * len(viralmiRNAnonO_data)
# Result shape: {'SO': {identifier: [class ids]}}; only the inner mapping is merged.
viralmiRNAnonO_data = viralmiRNAnonO_data.set_index('Viral miRNA').to_dict()
nonO_data = {**nonO_data, **viralmiRNAnonO_data['SO']}

***
### viralmRNA sequences

In [None]:
# Distinct viral mRNA identifiers across every edge file that has a 'Viral mRNA' column.
viralmRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['Viral mRNA']
    for path in (
        '../resources/edge_data/lncRNA-viralmRNA2434.txt',
        '../resources/edge_data/miRNA-viralmRNA2434.txt',
        '../resources/edge_data/pseudo-viralmRNA2434.txt',
        '../resources/edge_data/snRNA-viralmRNA2434.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0001041'] and fold the result into the running map.
viralmRNAnonO_data = (
    viralmRNAnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0001041']] * len(frame))
    .set_index('Viral mRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **viralmRNAnonO_data['SO']}

***
### viralnsRNA sequences

In [None]:
# Map every distinct viral nsRNA identifier to the class list ['SO_0001041']
# and fold the result into the running nonO_data map.
viralnsRNAnonO_data = (
    pd.read_csv('../resources/edge_data/mRNA-viralnsRNA2434.txt', sep='\t')['Viral nsRNA']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0001041']] * len(frame))
    .set_index('Viral nsRNA')
    .to_dict()
)
nonO_data = {**nonO_data, **viralnsRNAnonO_data['SO']}

***
### viralunRNA sequences

In [None]:
# Distinct viral unknown-RNA identifiers from the miRNA-viralunRNA edge file.
# This section stores its result in its own variable (viralunRNAnonO_data)
# rather than borrowing vtRNAsnonO_data, the name used by the vtRNAs section,
# so each section's result stays addressable under its own name.
viralunRNAnonO_data = pd.read_csv('../resources/edge_data/miRNA-viralunRNA2434.txt',sep='\t')['Viral unknown RNA'].drop_duplicates()

# Link every identifier to ['SO_0001041'].
viralunRNAnonO_data = pd.DataFrame(viralunRNAnonO_data)
viralunRNAnonO_data['SO'] = [['SO_0001041']] * len(viralunRNAnonO_data)
# Result shape: {'SO': {identifier: [class ids]}}; only the inner mapping is merged.
viralunRNAnonO_data = viralunRNAnonO_data.set_index('Viral unknown RNA').to_dict()
nonO_data = {**nonO_data, **viralunRNAnonO_data['SO']}

***
### vtRNAs sequences

In [None]:
# Map every distinct vtRNA identifier to the class list ['SO_0000404']
# and fold the result into the running nonO_data map.
vtRNAsnonO_data = (
    pd.read_csv('../resources/edge_data/vtRNAs-protein2434.txt', sep='\t')['vtRNAs']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['SO_0000404']] * len(frame))
    .set_index('vtRNAs')
    .to_dict()
)
nonO_data = {**nonO_data, **vtRNAsnonO_data['SO']}

***
### tRNA-related modifications

In [None]:
# Map every distinct tRNA modification label to the class list ['GO_0140101'].
# The column is still called 'SO', like the other sections, although the class is a GO term.
tRNAmodnonO_data = (
    pd.read_csv('../resources/edge_data/tRNA-modification2434.txt', sep='\t')['Modification']
    .drop_duplicates()
    .to_frame()
    .assign(SO=lambda frame: [['GO_0140101']] * len(frame))
    .set_index('Modification')
    .to_dict()
)
nonO_data = {**nonO_data, **tRNAmodnonO_data['SO']}


***
### Reactome pathways

In [None]:
# Reactome pathway identifiers: second column (label 1) of the header-less map file.
reactomenonO_data = pd.read_csv('../resources/processed_data/DESC_REACTOME_MAP.txt', header=None, sep='\t')[1]

# View the current map as a Series so its keys can be joined against.
nonO_data_series = pd.Series(nonO_data)

a = reactomenonO_data.to_frame(name='Reactome')
b = pd.DataFrame(nonO_data_series)
b['Reactome'] = b.index
# c holds the pathways that are already keys of nonO_data.
c = pd.merge(a, b, on=['Reactome'])

# Adding Reactome Pathways not covered by PKT.
# The filtered set is what gets merged below: pathways that already have a
# mapping in nonO_data keep it instead of being overwritten with the generic
# ['PW_0000001'] (same pattern as the SNP and Gene sections).
reactomenonO_data = a[~a['Reactome'].isin(c['Reactome'])]['Reactome'].unique()

# .unique() returns an array, so the frame's only column is labelled 0.
reactomenonO_data = pd.DataFrame(reactomenonO_data)
reactomenonO_data['SO'] = [['PW_0000001']] * len(reactomenonO_data)
reactomenonO_data = reactomenonO_data.set_index(0).to_dict()
nonO_data = {**nonO_data, **reactomenonO_data['SO']}

***
### Wikipathways

In [None]:
# WikiPathways identifiers: second column (label 1) of the header-less map file.
# Every identifier is linked to ['PW_0000001'] and merged into the running map.
wpwnonO_data = (
    pd.read_csv('../resources/processed_data/DESC_WIKIPATHWAYS_MAP.txt', header=None, sep='\t')[1]
    .to_frame()
    .assign(SO=lambda frame: [['PW_0000001']] * len(frame))
    .set_index(1)
    .to_dict()
)
nonO_data = {**nonO_data, **wpwnonO_data['SO']}

***
### Antisense RNA sequences

In [None]:
# Distinct antisense RNA identifiers from both antisenseRNA-GO edge files.
asRNAnonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['Antisense RNA']
    for path in (
        '../resources/edge_data/antisenseRNA-go2327.txt',
        '../resources/edge_data/antisenseRNA-go2331.txt',
    )
]).drop_duplicates()

# Link each identifier to ['SO_0000644'] and fold the result into the running map.
asRNAnonO_data = (
    asRNAnonO_data.to_frame()
    .assign(SO=lambda frame: [['SO_0000644']] * len(frame))
    .set_index('Antisense RNA')
    .to_dict()
)
nonO_data = {**nonO_data, **asRNAnonO_data['SO']}

***
### Variants (SNPs)

In [None]:
# Distinct SNP identifiers across the variant edge files. The piRNA file
# names its column 'rs Id', so it is renamed to 'SNP' before selection.
SNPnonO_data = pd.concat(
    [
        pd.read_csv('../resources/edge_data/variant-piRNA2566.txt', sep='\t')
        .rename(columns={'rs Id': 'SNP'})['SNP']
    ]
    + [
        pd.read_csv(path, sep='\t')['SNP']
        for path in (
            '../resources/edge_data/variant-miRNA2566.txt',
            '../resources/edge_data/variant-premiRNA2566.txt',
            '../resources/edge_data/variant-gene2566.txt',
            '../resources/edge_data/variant-disease2566.txt',
            '../resources/edge_data/variant-TF2566.txt',
        )
    ]
).drop_duplicates()

# View the current map as a Series so its keys can be joined against.
nonO_data_series = pd.Series(nonO_data)

a = SNPnonO_data.to_frame()
b = nonO_data_series.to_frame()
b['SNP'] = b.index
# c holds the SNPs that are already keys of nonO_data.
c = a.merge(b, on=['SNP'])

# Keep only the SNPs that are not yet keys of nonO_data (not covered by PKT).
SNPnonO_data = a.loc[~a['SNP'].isin(c['SNP']), 'SNP'].unique()

# .unique() returns an array, so the frame's only column is labelled 0.
SNPnonO_data = (
    pd.DataFrame(SNPnonO_data)
    .assign(SO=lambda frame: [['SO_0001059']] * len(frame))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **SNPnonO_data['SO']}

***
### Genes

In [None]:
# Distinct gene identifiers across every edge file that has a 'Gene' column.
genenonO_data = pd.concat([
    pd.read_csv(path, sep='\t')['Gene']
    for path in (
        '../resources/edge_data/snoRNA-gene2434.txt',
        '../resources/edge_data/miRNA-gene2449.txt',
        '../resources/edge_data/others-gene2434.txt',
        '../resources/edge_data/variant-gene2566.txt',
        '../resources/edge_data/gRNA-gene11007.txt',
        '../resources/edge_data/mRNA-gene2434.txt',
        '../resources/edge_data/miRNA-gene11002.txt',
        '../resources/edge_data/lncRNA-gene2434.txt',
        '../resources/edge_data/miRNA-gene11013.txt',
        '../resources/edge_data/miRNA-gene11016.txt',
        '../resources/edge_data/miRNA-gene2450.txt',
    )
]).drop_duplicates()

# View the current map as a Series so its keys can be joined against.
nonO_data_series = pd.Series(nonO_data)

# Both sides are cast to str so the join compares like with like.
a = genenonO_data.astype('str').to_frame()
b = nonO_data_series.to_frame()
b['Gene'] = b.index.astype('str')
# c holds the genes that are already keys of nonO_data.
c = a.merge(b, on=['Gene'])

# Keep only the genes that are not yet keys of nonO_data (not covered by PKT).
genenonO_data = a.loc[~a['Gene'].isin(c['Gene']), 'Gene'].unique()

# .unique() returns an array, so the frame's only column is labelled 0.
genenonO_data = (
    pd.DataFrame(genenonO_data)
    .assign(SO=lambda frame: [['SO_0000704']] * len(frame))
    .set_index(0)
    .to_dict()
)
nonO_data = {**nonO_data, **genenonO_data['SO']}

***

In [None]:
# Serialise the completed identifier -> class-list map to the construction-approach
# pickle, using the highest pickle protocol available.
with open('../resources/construction_approach/subclass_construction_map.pkl', 'wb') as handle:
    pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(nonO_data)

In [None]:
# DO NOT RUN as part of the pipeline: this cell is an optional sanity check only.
# Uncommenting the two lines below reloads the pickle from the path used by the
# write cell above and lists its key/value pairs for inspection.
#nonO_data = pd.read_pickle(r'../resources/construction_approach/'+'subclass_construction_map.pkl')

#nonO_data.items()

At this point, please run the [<tt>main.ipynb</tt>](https://github.com/AnacletoLAB/RNA-KG/blob/main/main.ipynb) notebook to generate RNA-KG.