# <p style="text-align: center;">RNA-KG node properties and entity linking</p>
    
***
***

**Author:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@unimi.it)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/)
  
<br>  
  
**Purpose:** This notebook serves as a script to add properties to entities within the RNA-centered Knowledge Graph. Entities without a direct corresponce to an ontology class are linked to a proper ontology class via the RDF `subClassOf` predicate.

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts.  
- **Data**: All downloaded and generated data sources are provided through [10.5281/zenodo.10078876](https://zenodo.org/doi/10.5281/zenodo.10078876) dedicated repository. <u>This notebook will download everything that is needed for you</u>.  
_____
***

In [None]:
%%capture
import sys
!{sys.executable} -m pip install -r requirements.txt
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import concurrent.futures
import pandas as pd
import networkx as nx
import gffpandas.gffpandas as gffpd
import numpy as np
pd.set_option('display.max_columns', None)
import re
from Bio import SeqIO
import json
import ast

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

In [None]:
# directory to store resources
resource_data_location = '../resources/'

# directory to use for unprocessed data
unprocessed_data_location = '../resources/processed_data/unprocessed_data/'

# directory to use for unprocessed edge data
unprocessed_edge_data_location = '../resources/processed_data/unprocessed_data/edges/'

# directory to use for unprocessed property data
unprocessed_property_data_location = '../resources/processed_data/unprocessed_data/properties/'

# directory to use for processed data
processed_data_location = '../resources/processed_data/'

# directory to write ontology data to
ontology_data_location = '../resources/ontologies/'

# directory to write edges data to
edge_data_location = '../resources/edge_data/'

# directory to write node properties to
properties_location = '../resources/property_data/'

# processed data url 
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'

# original data url 
original_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/original_data/'

# owltools location
owltools_location = '../pkt_kg/libs/owltools'

***
***

In [None]:
type = []
for filename in os.listdir(unprocessed_edge_data_location):
    if filename.endswith('.pkl'):
        type.append(filename.split('_')[0])
        type.append(filename.split('_')[-1].replace('.pkl', ''))
type = set(type)
type

***
***
# RNA

* [RNAcentral](https://rnacentral.org/) <br/> RNAcentral is a free, public resource that offers integrated access to a comprehensive and up-to-date set of non-coding RNA sequences provided by a collaborating group of Expert Databases representing a broad range of organisms and RNA types.

In [None]:
!wget https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/sequences/rnacentral_species_specific_ids.fasta.gz -O $unprocessed_property_data_location/rnacentral_species_specific_ids.fasta.gz 
!gunzip $unprocessed_property_data_location/rnacentral_species_specific_ids.fasta.gz 

In [None]:
def read_fasta_to_df(fasta_file):
    records = SeqIO.parse(fasta_file, "fasta")   
    data = [{'ID': record.id.split("_")[0], 'Sequence': str(record.seq), 'Description': str(record.description.split(" ",1)[1])}
            for record in records if record.id.split("_")[1] == "9606"] # aggiungere virus
    print(records)
    return pd.DataFrame(data)

df = read_fasta_to_df(unprocessed_property_data_location + "rnacentral_species_specific_ids.fasta")

df.to_csv(unprocessed_property_data_location + 'rnacentral_species_specific_ids_human.csv', index=False)
#df = pd.read_csv(unprocessed_property_data_location + 'rnacentral_species_specific_ids_human.csv')
df['Description'] = df['Description'].str.lower()
df['Sequence'] = df['Sequence'].str.upper()
df.head(n=3)

In [None]:
!wget https://ftp.ebi.ac.uk/pub/databases/RNAcentral/current_release/genome_coordinates/gff3/homo_sapiens.GRCh38.gff3.gz -O $unprocessed_property_data_location/homo_sapiens.GRCh38.gff3.gz
!gunzip $unprocessed_property_data_location/homo_sapiens.GRCh38.gff3.gz

In [None]:
def parse_attributes(attr_str):
    return dict(item.split('=') for item in attr_str.split(';') if '=' in item)

gff_df = gffpd.read_gff3(unprocessed_data_location + 'homo_sapiens.GRCh38.gff3').df
gff_df = gff_df[gff_df['attributes'].str.contains("_9606;")]
gff_df['parsed_attributes'] = gff_df['attributes'].apply(parse_attributes)
gff_df = pd.concat([gff_df.drop(columns=['attributes']), gff_df['parsed_attributes'].apply(pd.Series)], axis=1)
gff_df.to_csv(unprocessed_property_data_location + "gffrnacentral.csv", index=False)
#gff_df = pd.read_csv(unprocessed_property_data_location + "gffrnacentral.csv")
gff_df['Name'] = gff_df['Name'].str.split("_").str[0]
gff_df['Genomic_location'] = "chr" + gff_df['seq_id'].astype(str) + ":" + gff_df['start'].astype(str) +\
    "-" + gff_df['end'].astype(str) + gff_df['strand']
gff_df.head(n=3)

In [None]:
rnacentral_properties = pd.merge(df, gff_df, left_on='ID', right_on='Name', how='outer')
rnacentral_properties['Name'] = rnacentral_properties['Name'].fillna(rnacentral_properties['ID_x'])
rnacentral_properties = rnacentral_properties[['Name', 'Sequence', 'Description', 'description',
                                               'type.1', 'source', 'databases', 'Genomic_location']]
rnacentral_properties.head(n=2)

In [None]:
rnacentral_map = pd.read_csv(unprocessed_data_location + "id_mapping.tsv", delimiter='\t',
                             names=['ID', 'DB', 'DB ID', 'Organism', 'RNA category', 'Label'])
rnacentral_map_human = rnacentral_map[rnacentral_map['Organism'] == 9606]
rnacentral_map_human.head(n=3)

In [None]:
rnacentral_properties = pd.merge(rnacentral_properties, rnacentral_map_human, left_on='Name', right_on='ID', how='right')
rnacentral_properties['Name'] = rnacentral_properties['Name'].fillna(rnacentral_properties['ID'])
rnacentral_properties['description'] = rnacentral_properties['description'].fillna(rnacentral_properties['Label'])
rnacentral_properties.head(n=2)

In [None]:
rnacentral_properties['Category'] = rnacentral_properties['type.1'] + "," + rnacentral_properties['RNA category']
rnacentral_properties['Category'] = rnacentral_properties['Category'].str.split(",")
rnacentral_properties = rnacentral_properties.explode('Category')
rnacentral_properties = rnacentral_properties.drop(
    columns=['type.1', 'source','databases','ID','DB','DB ID','Organism','RNA category','Label']).drop_duplicates()
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace(np.nan, 'ncRNA')

rnacentral_el = rnacentral_properties.copy()
rnacentral_el[['Name','Category']].drop_duplicates().to_pickle(unprocessed_property_data_location + 'rnacentral_el.pkl')

rnacentral_properties.head(n=3)

In [None]:
rnacentral_properties = rnacentral_properties.groupby('Name').agg({
    'Category': set,
    'Description': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
    'description': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
    'Genomic_location': lambda x: set(x.dropna()) if x.dropna().any() else set(),
    'Sequence': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan
}).reset_index()
rnacentral_properties.head(n=3)

In [None]:
rnacentral_properties.Category = rnacentral_properties.Category.astype(str)
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('{\'', '', regex=True)
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('\'}', '', regex=True)
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('\'', '', regex=True)
rnacentral_properties.Category.unique()

In [None]:
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('lncRNA', 'RNA, ncRNA, lncRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('antisense_RNA', 'RNA, ncRNA, antisense_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('precursor_RNA', 'RNA, precursor_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('guide_RNA', 'RNA, ncRNA, guide_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('autocatalytically_spliced_intron',
                                                                              'RNA, ncRNA, intron, autocatalytically_spliced_intron')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('RNase_MRP_RNA',  'RNA, ncRNA, enzymatic_RNA, RNase_MRP_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('tmRNA', 'RNA, ncRNA, sncRNA, small_regulatory_ncRNA, tmRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('other', 'RNA, ncRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('circRNA', 'RNA, ncRNA, circRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('piRNA', 'RNA, ncRNA, sncRNA, small_regulatory_ncRNA, piRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('miRNA', 'RNA, ncRNA, sncRNA, small_regulatory_ncRNA, miRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('pre_miRNA',
                                                                              'RNA, precursor_RNA, ncRNA, sncRNA, small_regulatory_ncRNA, miRNA, pre_miRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('snRNA', 'RNA, ncRNA, sncRNA, snRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('sRNA', 'RNA, ncRNA, sncRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('snoRNA', 'RNA, ncRNA, sncRNA, snoRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('Y_RNA', 'RNA, ncRNA, Y_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('scaRNA', 'RNA, ncRNA, sncRNA, snoRNA, scaRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('scRNA', 'RNA, ncRNA, sncRNA, scRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('tRNA', 'RNA, ncRNA, sncRNA, tRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('SRP_RNA', 'RNA, ncRNA, SRP_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('ncRNA', 'RNA, ncRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('rRNA', 'RNA, ncRNA, rRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('ribozyme', 'RNA, ncRNA, enzymatic_RNA, ribozyme')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('hammerhead_ribozyme',
                                                                              'RNA, ncRNA, enzymatic_RNA, ribozyme, hammerhead_ribozyme')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('RNase_P_RNA', 'RNA, ncRNA, enzymatic_RNA, RNase_P_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('vault_RNA', 'RNA, ncRNA, vault_RNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('misc_RNA', 'RNA, ncRNA')
rnacentral_properties['Category'] = rnacentral_properties['Category'].replace('telomerase_RNA', 'RNA, ncRNA, telomerase_RNA')
rnacentral_properties['Category'].unique()

In [None]:
rnacentral_properties[':ID'] = "https://rnacentral.org/rna/" + rnacentral_properties['Name'].astype(str) + "_9606"
rnacentral_properties['Category'] = rnacentral_properties['Category'].str.split(", ").apply(lambda items: [i for i in items]).apply(json.dumps)
rnacentral_properties['Genomic_location'] = rnacentral_properties['Genomic_location'].apply(lambda x: json.dumps(list(x)))
rnacentral_properties['Sequence'] = rnacentral_properties['Sequence'].str.replace('T', 'U')
rnacentral_properties = rnacentral_properties.rename(columns={'Name':'RNAcentral_ID', 'Category':':TYPE', 'description':'Label'})
rnacentral_properties['Species'] = 'Homo sapiens'
rnacentral_properties.head(n=3)

In [None]:
rnacentral_properties.Label = rnacentral_properties.Label.str.replace("\(human\) ", "")
rnacentral_properties.head(n=3)

We add secondary structures (cloverleaves) from tRNA sequences from GtRNAdb.

In [None]:
rnacentral_map_gtrnadb = pd.read_csv(processed_data_location + "RNAcentral_MAP/gtrnadb.tsv",sep='\t',
                                     names=['RNAcentral ID', 'DB', 'GtRNAdb transcript ID', 'Organism', 'RNA category', 'GtRNAdb Gene ID'])
rnacentral_map_human_gtrnadb = rnacentral_map_gtrnadb[rnacentral_map_gtrnadb['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])
trna = rnacentral_map_human_gtrnadb[['GtRNAdb Gene ID']].drop_duplicates()
trna.head(n=3)

In [None]:
# Example to show retrieval logic
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
tRNA = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/tRNA-Thr-TGT-2-1.html')[0].T
tRNA2 = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/tRNA-Thr-TGT-2-1.html')[1].T
tRNA = pd.concat([tRNA,tRNA2],axis=1)
tRNA.columns = tRNA.iloc[0]
tRNA = tRNA[1:][['RNAcentral ID','Secondary Structure (nested bp)']]
tRNA

In [None]:
from tqdm import tqdm
for identifier in tqdm(trna['GtRNAdb Gene ID'], desc="Processing tRNA structures"):
    try:
        temp = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/' + identifier + '.html')[0].T
        temp2 = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/' + identifier + '.html')[1].T
        temp = pd.concat([temp, temp2], axis=1)
        temp.columns = temp.iloc[0]
        temp = temp[1:]
        tRNA = pd.concat([tRNA, temp])
    except:
        pass

tRNA = tRNA[['RNAcentral ID','Secondary Structure (nested bp)']].drop_duplicates().rename(
    columns={'Secondary Structure (nested bp)':'Structure','RNAcentral ID':'RNAcentral_ID'})
tRNA['RNAcentral_ID'] = tRNA['RNAcentral_ID'].str.replace("_9606","")
tRNA['Structure'] = tRNA['Structure'].replace("",np.nan)
tRNA.to_pickle(unprocessed_property_data_location + 'gtrnadb.pkl')
#tRNA = pd.read_pickle(unprocessed_property_data_location + 'gtrnadb.pkl')
tRNA.head(n=3)

In [None]:
rnacentral_properties = rnacentral_properties.merge(tRNA, on='RNAcentral_ID', how='left')
rnacentral_properties.to_pickle(unprocessed_property_data_location + 'RNAcentral.pkl')
rnacentral_properties.head(n=3)

* Ensembl

In [None]:
!wget ftp://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz -O $unprocessed_property_data_location/Homo_sapiens.GRCh38.cdna.all.fa.gz
!gunzip $unprocessed_property_data_location/Homo_sapiens.GRCh38.cdna.all.fa.gz
!wget ftp://ftp.ensembl.org/pub/release-113/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz -O  $unprocessed_property_data_location/Homo_sapiens.GRCh38.ncrna.fa.gz
!gunzip $unprocessed_property_data_location/Homo_sapiens.GRCh38.ncrna.fa.gz
!wget ftp://ftp.ensembl.org/pub/release-113/tsv/homo_sapiens/Homo_sapiens.GRCh38.113.canonical.tsv.gz -O  $unprocessed_property_data_location/Homo_sapiens.GRCh38.113.canonical.tsv.gz
!gunzip $unprocessed_property_data_location/Homo_sapiens.GRCh38.113.canonical.tsv.gz
!wget ftp://ftp.ensembl.org/pub/release-113/gtf/homo_sapiens/Homo_sapiens.GRCh38.113.gtf.gz -O  $unprocessed_property_data_location/Homo_sapiens.GRCh38.113.gtf.gz
!gunzip $unprocessed_property_data_location/Homo_sapiens.GRCh38.113.gtf.gz

In [None]:
rna_records = []
for record in SeqIO.parse(unprocessed_property_data_location + "Homo_sapiens.GRCh38.ncrna.fa", "fasta"):
    desc = record.description
    # Safely retrieve description
    if "description:" in desc:
        part_desc = desc.split("description:", 1)[1]
        if " [Source" in part_desc:
            description = part_desc.split(" [Source", 1)[0]
        else:
            description = part_desc
    else:
        description = ""

    # Safely retrieve transcript biotype
    if "transcript_biotype:" in desc:
        part_type = desc.split("transcript_biotype:", 1)[1]
        if " gene_symbol:" in part_type:
            rna_type = part_type.split(" gene_symbol:", 1)[0]
        elif " description:" in part_type:
            rna_type = part_type.split(" description:", 1)[0]
        else:
            rna_type = part_type
    else:
        rna_type = ""

    # Safely retrieve transcript biotype
    if "chromosome:GRCh38:" in desc:
        part_type = desc.split("chromosome:GRCh38:", 1)[1]
        if ":-1 " in part_type:
            genomic_loc = part_type.split(":-1 ", 1)[0] + "-"
        elif ":1 " in part_type:
            genomic_loc = part_type.split(":1 ", 1)[0] + "+"
    else:
        genomic_loc = ""

    rna_records.append({
        "ID": record.id.split(".")[0],
        "Sequence": str(record.seq),
        "Description": description,
        ":TYPE": rna_type,
        "Genomic_location": genomic_loc
    })

for record in SeqIO.parse(unprocessed_property_data_location + "Homo_sapiens.GRCh38.cdna.all.fa", "fasta"):
    desc = record.description
    if "description:" in desc:
        part_desc = desc.split("description:", 1)[1]
        if " [Source" in part_desc:
            description = part_desc.split(" [Source", 1)[0]
        else:
            description = part_desc
    else:
        description = ""

    if "transcript_biotype:" in desc:
        part_type = desc.split("transcript_biotype:", 1)[1]
        if " gene_symbol:" in part_type:
            rna_type = part_type.split(" gene_symbol:", 1)[0]
        elif " description:" in part_type:
            rna_type = part_type.split(" description:", 1)[0]
        else:
            rna_type = part_type
    else:
        rna_type = ""

    # Safely retrieve transcript biotype
    if "chromosome:GRCh38:" in desc:
        part_type = desc.split("chromosome:GRCh38:", 1)[1]
        if ":-1 " in part_type:
            genomic_loc = part_type.split(":-1 ", 1)[0] + "-"
        elif ":1 " in part_type:
            genomic_loc = part_type.split(":1 ", 1)[0] + "+"
    else:
        genomic_loc = ""

    rna_records.append({
        "ID": record.id.split(".")[0],
        "Sequence": str(record.seq),
        "Description": description,
        ":TYPE": rna_type,
        "Genomic_location": genomic_loc
    })

rnacentral_map_ensembl = pd.read_csv(processed_data_location + 'RNAcentral_MAP/ensembl.tsv',
    sep='\t', names=['RNAcentral ID', 'DB', 'Ensembl transcript ID', 'Organism', 'RNA category', 'Ensembl Gene ID'])
rnacentral_map_human_ensembl = rnacentral_map_ensembl[rnacentral_map_ensembl['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])
rnacentral_map_human_ensembl['Ensembl Gene ID'] = rnacentral_map_human_ensembl['Ensembl Gene ID'].str.split('.').str[0]

ensembl = pd.DataFrame(rna_records)
ensembl['Genomic_location'] = "chr" + ensembl['Genomic_location'].astype(str)
ensembl['Genomic_location'] = ensembl['Genomic_location'].str.replace(r'^([^:]*:[^:]*):', r'\1-', regex=True)
ensembl['Description'] = ensembl['Description'].str.lower()
ensembl['Sequence'] = ensembl['Sequence'].str.upper()

ensembl = ensembl[~ensembl['ID'].isin(rnacentral_map_human_ensembl['Ensembl transcript ID'])]
canonical_transcripts = pd.read_csv(unprocessed_property_data_location + "Homo_sapiens.GRCh38.113.canonical.tsv", sep="\t", header=None)
canonical_transcripts[1] = canonical_transcripts[1].str.split(".").str[0]
ensembl = ensembl[~ensembl['ID'].isin(canonical_transcripts[1])]

ensembl['Sequence'] = ensembl['Sequence'].replace("", np.nan)
ensembl['Description'] = ensembl['Description'].replace("", np.nan)
ensembl['Genomic_location'] = ensembl['Genomic_location'].replace("chr", np.nan)
ensembl['Genomic_location'] = ensembl['Genomic_location'].replace("", np.nan)
ensembl[':TYPE'] = ensembl[':TYPE'].replace("", np.nan)
ensembl[':TYPE'] = ensembl[':TYPE'].replace(np.nan, "RNA")

ensembl.head(n=3)

In [None]:
gtf = pd.read_csv(unprocessed_property_data_location + "Homo_sapiens.GRCh38.113.gtf", sep="\t", comment="#", header=None)
gtf = gtf[gtf[2] == "transcript"]
gtf["ID"] = gtf[8].str.extract(r'transcript_id "([^"]+)"')
gtf["Label"] = gtf[8].str.extract(r'transcript_name "([^"]+)"')
gtf = gtf[["ID", "Label"]]
gtf.head(n=3)

In [None]:
ensembl = ensembl.merge(gtf, how='left', on='ID')
ensembl['Label'] = ensembl['Label'].replace("", np.nan)
ensembl.head(n=3)

In [None]:
ensembl_el = ensembl[['ID',':TYPE']].drop_duplicates().copy()
ensembl_el.to_pickle(unprocessed_property_data_location + 'ensembl_el.pkl')

In [None]:
ensembl_properties = ensembl.groupby('ID').agg({
    ':TYPE': set,
    'Description': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
    'Label': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
    'Genomic_location': lambda x: set(x.dropna()) if x.dropna().any() else set(),
    'Sequence': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan
}).reset_index()
ensembl_properties.head(n=3)

In [None]:
ensembl_properties[':TYPE'] = ensembl_properties[':TYPE'].astype(str)
ensembl_properties[':TYPE'] = ensembl_properties[':TYPE'].replace('{\'', '', regex=True)
ensembl_properties[':TYPE'] = ensembl_properties[':TYPE'].replace('\'}', '', regex=True)
ensembl_properties[':TYPE'] = ensembl_properties[':TYPE'].replace('\'', '', regex=True)
ensembl_properties[':TYPE'].unique()

In [None]:
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('lncRNA', 'RNA, ncRNA, lncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('', 'RNA, ncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('antisense_RNA', 'RNA, ncRNA, antisense_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('precursor_RNA', 'RNA, precursor_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('guide_RNA', 'RNA, ncRNA, guide_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('autocatalytically_spliced_intron',
                                                                  'RNA, ncRNA, intron, autocatalytically_spliced_intron')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('RNase_MRP_RNA',  'RNA, ncRNA, enzymatic_RNA, RNase_MRP_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('tmRNA', 'RNA, ncRNA, sncRNA, small_regulatory_ncRNA, tmRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('other', 'RNA, ncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('circRNA', 'RNA, ncRNA, circRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('piRNA', 'RNA, ncRNA, sncRNA, small_regulatory_ncRNA, piRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('miRNA', 'RNA, ncRNA, sncRNA, small_regulatory_ncRNA, miRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('pre_miRNA',
                                                                  'RNA, precursor_RNA, ncRNA, sncRNA, small_regulatory_ncRNA, miRNA, pre_miRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('snRNA', 'RNA, ncRNA, sncRNA, snRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('sRNA', 'RNA, ncRNA, sncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('snoRNA', 'RNA, ncRNA, sncRNA, snoRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('Y_RNA', 'RNA, ncRNA, Y_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('scaRNA', 'RNA, ncRNA, sncRNA, snoRNA, scaRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('tRNA', 'RNA, ncRNA, sncRNA, tRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('SRP_RNA', 'RNA, ncRNA, SRP_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('ncRNA', 'RNA, ncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('rRNA', 'RNA, ncRNA, rRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('ribozyme', 'RNA, ncRNA, enzymatic_RNA, ribozyme')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('hammerhead_ribozyme',
                                                                  'RNA, ncRNA, enzymatic_RNA, ribozyme, hammerhead_ribozyme')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('RNase_P_RNA', 'RNA, ncRNA, enzymatic_RNA, RNase_P_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('vault_RNA', 'RNA, ncRNA, vault_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('misc_RNA', 'RNA, ncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TEC', 'RNA, ncRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('retained_intron', 'RNA, ncRNA, intron, retained_intron')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('telomerase_RNA', 'RNA, ncRNA, telomerase_RNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('protein_coding', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TR_J_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TR_D_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_V_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_C_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_D_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_J_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TR_C_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TR_V_gene', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('protein_coding_LoF', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('protein_coding_LoF', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('protein_coding_CDS_not_defined', 'RNA, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('non_stop_decay', 'RNA, mRNA, RNA_decay')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('nonsense_mediated_decay', 'RNA, mRNA, RNA_decay')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('rRNA_pseudogene', 'RNA, ncRNA, rRNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_C_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('processed_transcript', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('processed_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TR_J_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('transcribed_unprocessed_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('unprocessed_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('unitary_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('transcribed_unitary_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_V_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('TR_V_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('translated_processed_pseudogene', 'RNA, RNA_pseudogene, mRNA')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('IG_J_pseudogene', 'RNA, RNA_pseudogene')
ensembl_properties[":TYPE"] = ensembl_properties[":TYPE"].replace('transcribed_processed_pseudogene', 'RNA, RNA_pseudogene')

ensembl_properties[":TYPE"].unique()

In [None]:
ensembl_properties[':ID'] = "https://www.ensembl.org/Homo_sapiens/Transcript/Summary?t=" + ensembl_properties['ID'].astype(str)
ensembl_properties[':TYPE'] = ensembl_properties[':TYPE'].str.split(", ").apply(lambda items: [i for i in items]).apply(json.dumps)
ensembl_properties['Sequence'] = ensembl_properties['Sequence'].str.replace("T","U")
ensembl_properties['Genomic_location'] = ensembl_properties['Genomic_location'].apply(lambda x: json.dumps(list(x)))
ensembl_properties = ensembl_properties.rename(columns={'ID':'Ensembl_ID'})
ensembl_properties['Species'] = 'Homo sapiens'
ensembl_properties.to_pickle(unprocessed_property_data_location + 'ensembl.pkl')
ensembl_properties.head(n=3)

* Addgene

In [None]:
gRNA_gene = pd.read_csv(unprocessed_data_location + 'grna_sequences_addgene.txt', sep='\t', dtype = {"Plasmid ID":str})  
gRNA_gene.columns=gRNA_gene.columns.str.rstrip()
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].notna()]
gRNA_gene = gRNA_gene[gRNA_gene['Target Species'].str.contains('apiens')]
gRNA_gene = gRNA_gene[~gRNA_gene['Plasmid ID'].isna()]
gRNA_gene['Plasmid ID'] = 'www.addgene.org/'+gRNA_gene['Plasmid ID'].astype(str).str.rstrip()
gRNA_gene['Target Gene'] = gRNA_gene['Target Gene'].str.upper().astype(str).str.rstrip()
gRNA_gene.drop(columns=['Target Species','Depositor'],inplace=True)
gRNA_gene['Cas9 Species'] = gRNA_gene['Cas9 Species'].str.strip()

gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.replace(' $', '', regex=True)
gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.replace('gRNA1: ', '')
gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.replace('gRNA1:', '')
gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.replace(', gRNA2', '|')
gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.replace(r'; gRNA2:\s*', ', ', regex=True)
gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.split(", ")
gRNA_gene = gRNA_gene.explode('Target Sequence')

gRNA_gene = gRNA_gene.groupby(['Plasmid ID']).agg({'Cas9 Species': 'first', 'Target Sequence':'first'}).reset_index()
gRNA_gene[':TYPE'] = 'RNA, ncRNA, gRNA'
gRNA_gene[':ID'] = 'https://' + gRNA_gene['Plasmid ID']
gRNA_gene['Plasmid ID'] = gRNA_gene['Plasmid ID'].str.replace("www.addgene.org/", "")
gRNA_gene['Target Sequence'] = gRNA_gene['Target Sequence'].str.replace("T", "U")
gRNA_gene.rename(columns={'Plasmid ID':'Addgene_ID','Cas9 Species':'Species','Target Sequence':'Sequence'},inplace=True)
gRNA_gene = gRNA_gene.drop_duplicates(subset=[':ID'],keep='first')
gRNA_gene[':TYPE'] = gRNA_gene[':TYPE'].str.split(", ").apply(lambda items: [i for i in items]).apply(json.dumps)
gRNA_gene.to_pickle(unprocessed_property_data_location + 'addgene.pkl')
gRNA_gene.head(n=3)

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html) <br /> The MIT/ICBP siRNA Database has validated siRNA and shRNA sequences against over 100 genes.

In [None]:
ICBP = pd.read_html('http://web.mit.edu/sirna/sirnas-gene.html') # siRNA
ICBP = ICBP[1]
ICBP.columns = ICBP.iloc[[0]].squeeze()
ICBP.drop(0, inplace=True)
ICBP[['ID#']] = ICBP[['ID#']] + '.html'
ICBP.head(n=3)

In [None]:
def process_sirna_data(ICBPsiRNA):
    """
    Processes a DataFrame of sirna data, fetches, extracts and processes information for each sirna ID.
    """
    # Initialize an empty DataFrame to hold the results
    property_df = pd.DataFrame()

    # Iterate over each unique sirna_id
    for sirna_id in ICBPsiRNA['ID#'].unique():
        
        url = 'http://web.mit.edu/sirna/sequences/results-' + sirna_id
        
        # Read the HTML content and extract the table using pandas
        ICBP = pd.read_html(url)
        df = ICBP[1]
        df = df.T  # Transpose the dataframe
        df.reset_index(drop=True, inplace=True)
        df.rename(columns={3:'Sequence'}, inplace=True)
        df=df[['Sequence']]

        # Extract specific information from the columns using regex
        df['Sequence'] = df['Sequence'].str.extract(r'Sense sequence: (.*)')

        # Fill NaN values with empty strings
        df['Sequence'] = df['Sequence'].fillna('')  

        # Combine relevant rows into one DataFrame for a clean representation
        df_combined = pd.DataFrame({
            'Sequence': df.iloc[0]['Sequence']
        }, index=[0])
        df_combined['ID'] = sirna_id
        property_df = pd.concat([property_df, df_combined], ignore_index=True)

    # Return the final DataFrame containing all the processed data
    return property_df

ICBPsiRNA = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')]
sirna = process_sirna_data(ICBPsiRNA)
sirna[':TYPE'] = "RNA, ncRNA, sncRNA, small_regulatory_ncRNA, siRNA"
sirna.head(n=3)

In [None]:
def process_shrna_data(ICBPsiRNA):
    # Initialize an empty DataFrame to hold the results
    property_df = pd.DataFrame()

    # Iterate over each unique sirna_id
    for sirna_id in ICBPsiRNA['ID#'].unique():
        
        url = 'http://web.mit.edu/sirna/sequences/results-' +sirna_id
        
        # Read the HTML content and extract the table using pandas
        ICBP = pd.read_html(url)
        #print(url)
        df = ICBP[1]
        df = df.T  # Transpose the dataframe
        df.reset_index(drop=True, inplace=True)
        df.rename(columns={3:'Sequence'}, inplace=True)
        df=df[["Sequence"]]

        # Extract specific information from the columns using regex
        df['Sequence'] = df['Sequence'].str.extract(r'Sequence: (.*)')
        df['Sequence'] = df['Sequence'].str.replace(" ", "")
        df['Sequence'] = df['Sequence'].fillna('')  

        # Combine relevant rows into one DataFrame for a clean representation
        df_combined = pd.DataFrame({
            'Sequence': df.iloc[0]['Sequence']
        }, index=[0])
        df_combined['ID'] = sirna_id
        property_df = pd.concat([property_df, df_combined])

    return property_df

ICBPshRNA = ICBP.loc[(ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')] # shRNA
shrna = process_shrna_data(ICBPshRNA)
shrna[':TYPE'] = "RNA, ncRNA, sncRNA, small_regulatory_ncRNA, shRNA"
shrna.head(n=3)

In [None]:
icbp = pd.concat([sirna,shrna])
icbp['Sequence'] = icbp['Sequence'].str.replace("T", "U")
icbp[':ID'] = "http://web.mit.edu/sirna/sequences/results-" + icbp['ID']
icbp['ICBP_ID'] = icbp['ID'].str.replace(".html", "")
icbp = icbp.drop_duplicates(subset=[':ID'],keep='first')
icbp.drop(columns=['ID'],inplace=True)
icbp[':TYPE'] = icbp[':TYPE'].str.split(", ").apply(lambda items: [i for i in items]).apply(json.dumps)
icbp.to_pickle(unprocessed_property_data_location + 'icbp.pkl')
icbp.head(n=3)

* circBase

First, we convert hg19 genomic coordinates to hg38 using:
https://genome.ucsc.edu/cgi-bin/hgLiftOver 

In [None]:
circbase = pd.read_csv(unprocessed_data_location + 'hsa_hg19_circRNA.txt', sep='\t')
circbase_gen = circbase[['# chrom','start','end']]
circbase_gen['Genomic_coordinates'] = circbase_gen['# chrom'].astype(str) + ":" + circbase_gen['start'].astype(str) + "-" + circbase_gen['end'].astype(str)
pd.DataFrame(circbase_gen['Genomic_coordinates']).to_csv(unprocessed_property_data_location + 'circbase_genomic_coordinateshg19.csv', index=False, header=False)

In [None]:
# Remove convertion errors
circbase_gen_err = pd.read_csv(unprocessed_property_data_location + 'hglft_genome_2e13fc_68d250.err', header=None, comment='#')
circbase_gen = circbase_gen[['Genomic_coordinates']]
circbase_gen = circbase_gen[~circbase_gen['Genomic_coordinates'].isin(circbase_gen_err[0])].reset_index(drop=True)

circbase_gen38 = pd.read_csv(unprocessed_property_data_location + 'hglft_genome_2e13fc_68d250.bed', header=None)
circbase_gen38 = pd.concat([circbase_gen38, circbase_gen], axis=1).rename(columns={0:'Genomic_location', 'Genomic_coordinates':'Genomic_location_hg19'})
circbase_gen38.head(n=3)

In [None]:
from xml.etree import ElementTree as ET

circbase = pd.read_csv(unprocessed_data_location + 'hsa_hg19_circRNA.txt', sep='\t')

def fetch_sequence(db, chrom, start, end):
    base_url = "http://genome.ucsc.edu/cgi-bin/das"
    url = f"{base_url}/{db}/dna?segment={chrom}:{start},{end}"
    try:
        response = requests.get(url)
        response.raise_for_status()
        xml_root = ET.fromstring(response.content)
        dna_sequence = "".join(xml_root.find(".//DNA").text.splitlines())
        return dna_sequence.upper().replace(" ", "")
    except Exception as e:
        print(f"Error fetching sequence for {chrom}:{start}-{end}: {e}")
        return None

# We can retrieve the full sequence only for circRNAs we know the spliced seq is the same as the genomic seq.
# Without this filter, we would retrieve the full genomic sequence (DNA --> gene level) for all circRNAs. 
circbase['Genomic_location_hg19'] = circbase['# chrom'].astype(str) + ":" + circbase['start'].astype(str) + "-" + circbase['end'].astype(str)
circbase = pd.merge(circbase, circbase_gen38, left_on='Genomic_location_hg19', right_on='Genomic_location_hg19', how='left').drop(
    columns=['Genomic_location_hg19', '# chrom','start','end'])

circbase_seq = circbase[circbase['genomic length'] == circbase['spliced seq length']]
circbase_seq['# chrom'] = circbase_seq['Genomic_location'].str.split(":").str[0]
circbase_seq['start'] = circbase_seq['Genomic_location'].str.split(":").str[1].str.split("-").str[0]
circbase_seq['end'] = circbase_seq['Genomic_location'].str.split(":").str[1].str.split("-").str[1]
circbase_seq = circbase_seq[['Genomic_location','# chrom','start','end']].reset_index(drop=True)
circbase_seq['Sequence'] = circbase_seq.apply(lambda x: fetch_sequence("hg38", x['# chrom'], x['start'], x['end']), axis=1)
circbase_seq.to_csv(unprocessed_property_data_location + 'circbase_seq.csv', index=False)
#circbase_seq = pd.read_csv(unprocessed_property_data_location + 'circbase_seq.csv')
circbase_seq.head(n=3)

In [None]:
circbase_seq = pd.read_csv(unprocessed_property_data_location + 'circbase_seq.csv')
circbase_seq.head(n=3)

In [None]:
circbase['Genomic_location'] = circbase['# chrom'].astype(str) + ":" + circbase['start'].astype(str) + "-" +\
    circbase['end'].astype(str) + circbase['strand'].astype(str)
circbase = pd.merge(circbase, circbase_seq, on=['Genomic_location'], how='left')
circbase = circbase.rename(columns={'circRNA ID':'circBase_ID'})
circbase['Label'] = circbase['circBase_ID']
circbase[':ID'] = "http://circbase.org/cgi-bin/singlerecord.cgi?id=" + circbase['circBase_ID']

circbase = circbase.groupby(':ID').agg({
    'circBase_ID': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
    'Label': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
    'Genomic_location': lambda x: set(x.dropna()) if x.dropna().any() else set(),
    'Sequence': lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan
}).reset_index()

circbase[':TYPE'] = 'RNA, ncRNA, circRNA'
circbase[':TYPE'] = circbase[':TYPE'].str.split(", ").apply(lambda items: [i for i in items]).apply(json.dumps)
circbase['Sequence'] = circbase['Sequence'].astype(str).str.replace('T','U')
circbase['Sequence'] = circbase['Sequence'].replace('nan',np.nan)
circbase['Genomic_location'] = circbase['Genomic_location'].apply(lambda x: json.dumps(list(x)))
circbase['Species'] = 'Homo sapiens'
circbase.to_pickle(unprocessed_property_data_location + 'circbase.pkl')
circbase.head(n=3)

* [eSkip-Finder](https://eskip-finder.org/cgi-bin/input.cgi) <br /> eSkip-Finder is the first machine learning-based design tool and database of antisense oligonucleotides (ASOs) for exon skipping. A significant challenge, however, is the difficulty in selecting an optimal target sequence for exon skipping.

In [None]:
# https://eskip-finder.org/ --> Search the Database --> Search 'All' on Species=human
ASO_mRNA = pd.read_html(unprocessed_data_location + 'eSkip-Finder.html')[2]
ASO_mRNA = ASO_mRNA[ASO_mRNA['Species'] == 'human']
ASO_mRNA = ASO_mRNA[ASO_mRNA['Oligo name in literature'] != 'Null']
ASO_mRNA = ASO_mRNA[['Oligo name in literature','Oligo sequence /: Cocktail. -: weasel (connected).', 'Oligo chemistry']]
print(ASO_mRNA['Oligo chemistry'].unique())
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('unspecified', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('unspecified', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('unspecified', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('unspecified', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('Others (unspecified)', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('Others (tc-DNA)', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace("Others (mixed 2'OMe, F, and stereochemistry)", 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('Others (mixed (see Appendix))', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace("Others (2'-OMePS conjugated to cyclic peptide)",  'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('Others (2FPS)', 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace("Others (2'-MOE)", 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace("Others (LNA with phosphodiester linkage)", 'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('Null',  'nan')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].str.replace('2MOE', '2OMOE')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].str.replace(' (PPMO)', '')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].str.replace('mofified', 'modified')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].str.replace(' (BPMO)', '')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('unmodified PMO', 'PMO, unmodified PMO')
ASO_mRNA['Oligo chemistry'] = ASO_mRNA['Oligo chemistry'].replace('modified PMO', 'PMO, modified PMO')
print(ASO_mRNA['Oligo chemistry'].unique())
ASO_mRNA[':TYPE'] = 'RNA, ncRNA, sncRNA, oligo, antisense_oligonucleotide, RNA_antisense_oligonucleotide, ' +\
    ASO_mRNA['Oligo chemistry'].str.replace(', nan', '')
ASO_mRNA[':TYPE'] = ASO_mRNA[':TYPE'].str.split(", ").apply(lambda items: [i for i in items]).apply(json.dumps)
ASO_mRNA['Oligo sequence /: Cocktail. -: weasel (connected).'] = \
    ASO_mRNA['Oligo sequence /: Cocktail. -: weasel (connected).'].astype(str).str.replace("T",'U')
ASO_mRNA[':ID'] = 'https://eskip-finder.org/cgi-bin/input.cgi?' + ASO_mRNA['Oligo name in literature'].str.replace(r'\s+', '_', regex=True)
ASO_mRNA = ASO_mRNA.rename(columns={'Oligo sequence /: Cocktail. -: weasel (connected).':'Sequence','Oligo name in literature':'Label'})
ASO_mRNA = ASO_mRNA[[':ID','Label',':TYPE','Sequence']].drop_duplicates(subset=[':ID'],keep='first')
ASO_mRNA.to_pickle(unprocessed_property_data_location + 'eskipFinder.pkl')
ASO_mRNA.head(n=3)

* tsRFun

In [None]:
tsRNA = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")[['type','seq','tsRNAid']]
tsRNA[':TYPE'] = [["RNA", "ncRNA", "sncRNA", "tsRNA", "tRF"]] * len(tsRNA)
print(tsRNA['type'].unique())
tsRNA[':TYPE'] = tsRNA.apply(lambda row: row[':TYPE'] + [row['type']], axis=1)
tsRNA[':TYPE'] = tsRNA[':TYPE'].apply(json.dumps)
tsRNA['seq'] = tsRNA['seq'].str.replace("T","U")
tsRNA = tsRNA.rename(columns={'seq':'Sequence', 'tsRNAid':'tsRFun_ID'}).drop(columns=['type'])
tsRNA[':ID'] = 'http://biomed.nscc-gz.cn/DB/tsRFun/searchDetail-tsRNA.php?tsRNAid=' + tsRNA['tsRFun_ID']
tsRNA = tsRNA.drop_duplicates(subset=[':ID'],keep='first')
tsRNA['Species'] = 'Homo sapiens'
tsRNA.to_pickle(unprocessed_property_data_location + 'tsrfun.pkl')
tsRNA.head(n=3)

* tRFdb

In [None]:
#http://genome.bioch.virginia.edu/trfdb/index.php
tRF1_tRNA = pd.read_html(unprocessed_data_location+'trf1.html')[2]
tRF1_tRNA.drop(columns=['Organism'],inplace=True)
tRF1_tRNA.head()

tRF3_tRNA = pd.read_html(unprocessed_data_location+'trf3.html')[2]
tRF3_tRNA.drop(columns=['Organism'],inplace=True)

tRF5_tRNA = pd.read_html(unprocessed_data_location+'trf5.html')[2]
tRF5_tRNA.drop(columns=['Organism'],inplace=True)

tRF_tRNA = pd.concat([tRF1_tRNA,tRF3_tRNA,tRF5_tRNA])
tRF_tRNA = tRF_tRNA.drop(columns=['Experiment Info', 'Sequence'])
tRF_tRNA['tRF ID'] = tRF_tRNA['tRF ID'].astype(str)
tRF_tRNA.head(n=3)

In [None]:
import re

def get_numbers(identifier):
    
    html_file_path = unprocessed_data_location + 'trf' + identifier + '.html'

    with open(html_file_path, 'r', encoding='utf-8') as html_file:
        html_content = html_file.read()

    pattern = r'href=\'sequence_display.php\?seq_id=(\d+)'
    matches = re.findall(pattern, html_content)
    numbers = [int(match) for match in matches]

    pattern2 = r"href='experiments_display.php\?trf_id=(.*?)'"
    matches2 = re.findall(pattern2, html_content)
    
    # Return the numbers as a dictionary
    return {'sequence_numbers': numbers, 'experiment_numbers': matches2}

In [None]:
def transform(original_html):

    transformed_html = re.sub(r'<font face=', '\n<font face=', original_html)
    transformed_html = re.sub(r'<br><b>Organism:', "</font><br>\n<font face='Arial' size='2'><b>Organism:", transformed_html)
    transformed_html = re.sub(r'<br><b>tRF Sequence:', "</font><br>\n<font face='Arial' size='2'><b>tRF Sequence:", transformed_html)
    transformed_html = re.sub(r"<font face='Courier' size='3'>", "</font><br>\n<font face='Arial' size='2'>", transformed_html)
    transformed_html = re.sub(r"<br><b>Map Position:", "\n<font face='Arial' size='2'><b>Map Position:", transformed_html)

    return transformed_html

In [None]:
import requests

def get_html(identifier):
    url = 'http://genome.bioch.virginia.edu/trfdb/sequence_display.php?seq_id=' + identifier
    response = requests.get(url)
    if response.status_code == 500:
        html_content = response.text
        return html_content

In [None]:
from bs4 import BeautifulSoup

df = pd.DataFrame()
result = get_numbers('1')
numbers_mapping = dict(zip(result['sequence_numbers'], result['experiment_numbers']))

for i in result['sequence_numbers'] :
    
    html_content = get_html(str(i))  # Retrieve HTML content
    if html_content is not None:
        # Apply the transformation to the HTML content
        transformed_html = transform(html_content)

        # Continue with parsing and DataFrame creation
        soup = BeautifulSoup(transformed_html, 'html.parser')
        values = [font.get_text() for font in soup.find_all('font')]
        values = [value.split(":")[1].strip() if ":" in value else value for value in values]
        
        corresponding_experiment_number = numbers_mapping.get(i, None)

        # Create a DataFrame for the current HTML page
        temp = pd.DataFrame(values).T
        temp.columns = range(temp.shape[1])

        # Add the 'Experiment Number' column
        temp['Experiment Number'] = corresponding_experiment_number

        # Concatenate the current DataFrame with the main DataFrame
        df = pd.concat([df, temp], ignore_index=True)
 
result = get_numbers('3')
numbers_mapping = dict(zip(result['sequence_numbers'], result['experiment_numbers']))

for i in result['sequence_numbers'] :
    
    html_content = get_html(str(i))  # Retrieve HTML content
    if html_content is not None:
        # Apply the transformation to the HTML content
        transformed_html = transform(html_content)

        # Continue with parsing and DataFrame creation
        soup = BeautifulSoup(transformed_html, 'html.parser')
        values = [font.get_text() for font in soup.find_all('font')]
        values = [value.split(":")[1].strip() if ":" in value else value for value in values]
        
        corresponding_experiment_number = numbers_mapping.get(i, None)

        # Create a DataFrame for the current HTML page
        temp = pd.DataFrame(values).T
        temp.columns = range(temp.shape[1])

        # Add the 'Experiment Number' column
        temp['Experiment Number'] = corresponding_experiment_number

        # Concatenate the current DataFrame with the main DataFrame
        df = pd.concat([df, temp], ignore_index=True)

result = get_numbers('5')
numbers_mapping = dict(zip(result['sequence_numbers'], result['experiment_numbers']))

for i in result['sequence_numbers'] :
    
    html_content = get_html(str(i))  # Retrieve HTML content
    if html_content is not None:
        # Apply the transformation to the HTML content
        transformed_html = transform(html_content)

        # Continue with parsing and DataFrame creation
        soup = BeautifulSoup(transformed_html, 'html.parser')
        values = [font.get_text() for font in soup.find_all('font')]
        values = [value.split(":")[1].strip() if ":" in value else value for value in values]
        
        corresponding_experiment_number = numbers_mapping.get(i, None)

        # Create a DataFrame for the current HTML page
        temp = pd.DataFrame(values).T
        temp.columns = range(temp.shape[1])

        # Add the 'Experiment Number' column
        temp['Experiment Number'] = corresponding_experiment_number

        # Concatenate the current DataFrame with the main DataFrame
        df = pd.concat([df, temp], ignore_index=True)        

In [None]:
def extract_chr_substring(text):
    start_index = text.find('chr')
    if start_index != -1:
        end_index = text.find('&', start_index)
        if end_index != -1:
            return text[start_index:end_index]
    return ''

#df['Experiment Number'] = df['Experiment Number'].apply(extract_chr_substring)
df.columns = ['tRF ID','organism','empty','Sequence','Map Position','tRNA Gene Co-ordinates']
df = df.drop(columns=['organism','empty'])
df

In [None]:
tRF = pd.merge(tRF_tRNA,df,on=['tRF ID', 'tRNA Gene Co-ordinates'])
tRF['tRF ID'] = "trfdb?" + tRF['tRF ID'].astype(str)
tRF

In [None]:
#tRF.drop_duplicates().to_csv(properties_location + 'tRF_tRFdb.csv', index=None)
tRF = pd.read_csv(unprocessed_property_data_location + 'tRF_tRFdb.csv')
print(tRF.Type.unique())
tRF['tRNA Gene Co-ordinates'] = tRF['tRNA Gene Co-ordinates'].str.split("-")
tRF['Chromosome'] = tRF['tRNA Gene Co-ordinates'].str[0]
tRF['Map Position'] = tRF['Map Position'].str.split("-")
tRF['Start'] = tRF['tRNA Gene Co-ordinates'].str[1].astype(int) + tRF['Map Position'].str[0].astype(int)
tRF['End'] = tRF['tRNA Gene Co-ordinates'].str[2].astype(int) + tRF['Map Position'].str[1].astype(int)
tRF['Genomic_location'] = tRF['Chromosome'] + ":" + tRF['Start'].astype(str) + "-" + tRF['End'].astype(str)
tRF.Type = tRF.Type.str.replace('trf', 'tRF')
tRF[':TYPE'] = [["RNA", "ncRNA", "sncRNA", "tsRNA", "tRF"]] * len(tRF)
tRF[':TYPE'] = tRF.apply(lambda row: row[':TYPE'] + [row['Type']], axis=1)
tRF = tRF.explode(':TYPE')
tRF['Sequence'] = tRF['Sequence'].str.replace("T","U")
tRF = tRF.groupby(['tRF ID']).agg({'Sequence':'first', ':TYPE':set, "Genomic_location":lambda x: set(x.dropna()) if x.dropna().any() else set()}).reset_index()
tRF['Species'] = 'Homo sapiens'
tRF[':ID'] = "http://genome.bioch.virginia.edu/trfdb/experiments_display.php?" + tRF['tRF ID']
tRF['tRF_ID'] = tRF['tRF ID'].str.replace("trfdb?","")
tRF[':TYPE'] = tRF[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
tRF = tRF.drop(columns=['tRF ID'])
tRF.to_pickle(unprocessed_property_data_location + 'trfdb.pkl')
tRF.head(n=3)

* MINTBASE

In [None]:
tRNA_MINTbase_GtRNAdb_map=pd.read_csv(
    processed_data_location + 'tRNA_MINTbase_GtRNAdb_MAP.txt', header=None, sep='\t')
tRNA_MINTbase_GtRNAdb_map=tRNA_MINTbase_GtRNAdb_map.rename(columns={0:'MINTbase tRNA name',1:'gtRNAdb name'})
tRNA_MINTbase_GtRNAdb_map.head(n=3)

In [None]:
# https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbase.txt',sep='\t')
tRF_tRNA2['MINTbase Alternative IDs (GRCh37 assembly-derived)'] = tRF_tRNA2['MINTbase Alternative IDs (GRCh37 assembly-derived)'].str.split('@').str[0]
tRF_tRNA2.rename(columns={'MINTbase Alternative IDs (GRCh37 assembly-derived)':'MINTbase tRNA name'},inplace=True)
tRF_tRNA2 = pd.merge(tRF_tRNA2, tRNA_MINTbase_GtRNAdb_map, on='MINTbase tRNA name')
print(tRF_tRNA2['Type'].unique())
tRF_tRNA2['Type'] = tRF_tRNA2['Type'].str.replace("5'-tRF", "tRF-5")
tRF_tRNA2['Type'] = tRF_tRNA2['Type'].str.replace("3'-tRF", "tRF-3")
tRF_tRNA2['Type'] = tRF_tRNA2['Type'].str.replace('i-tRF', 'tRF-i')
tRF_tRNA2['Type'] = tRF_tRNA2['Type'].str.replace("5'-half", "tRF-5, tRF-5-half")
tRF_tRNA2['Type'] = tRF_tRNA2['Type'].str.replace("3'-half", "tRF-3, tRF-3-half")
tRF_tRNA2[':TYPE'] = 'RNA, ncRNA, sncRNA, tsRNA, tRF, ' + tRF_tRNA2['Type'].astype(str)
tRF_tRNA2['Genomic_location'] = "chr" + tRF_tRNA2['Chromosome'].astype(str) + ":" + tRF_tRNA2['Chromosome start position'].astype(str) \
    + "-" + tRF_tRNA2['Chromosome end position'].astype(str) + tRF_tRNA2['Chromosome strand'].astype(str)
tRF_tRNA2[':TYPE'] = tRF_tRNA2[':TYPE'].str.split(", ")
tRF_tRNA2 = tRF_tRNA2.explode(':TYPE')
tRF_tRNA2['Fragment sequence'] = tRF_tRNA2['Fragment sequence'].str.replace("T","U")
tRF_tRNA2 = tRF_tRNA2.groupby(['License Plate (sequence derived)']).agg({'Fragment sequence':'first', ':TYPE':set,
                                                                         'Genomic_location':lambda x: set(x.dropna())
                                                                            if x.dropna().any() else set()}).reset_index()
tRF_tRNA2['Species'] = 'Homo sapiens'
tRF_tRNA2[':ID'] = "https://cm.jefferson.edu/MINTbase/InputController?v=g&g=GRCh37&fn=" + tRF_tRNA2['License Plate (sequence derived)']
tRF_tRNA2[':TYPE'] = tRF_tRNA2[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
tRF_tRNA2 = tRF_tRNA2.rename(columns={'License Plate (sequence derived)':'Label','Fragment sequence':'Sequence'})
tRF_tRNA2['MINTbase_ID'] = tRF_tRNA2['Label']
tRF_tRNA2.to_pickle(unprocessed_property_data_location + 'mintbase.pkl')
tRF_tRNA2.head(n=3)

* TBDB

In [None]:
riboswitch_protein = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',') 
riboswitch_protein['Genomic_location'] = "chr:" + riboswitch_protein['locus_start'].astype(str) + "-" + riboswitch_protein['locus_end'].astype(str)
riboswitch_protein = riboswitch_protein[['accession_url','FASTA_sequence','unique_name','GBSeq_organism',
                                         'Genomic_location','Structure','accession_name']].rename(
                                             columns={'accession_url':':ID','FASTA_sequence':'Sequence','unique_name':'TBDB_ID',
                                                      'GBSeq_organism':'Species','accession_name':'Label'})
riboswitch_protein['Sequence'] = riboswitch_protein['Sequence'].str.replace("T","U")
riboswitch_protein[':TYPE'] = [["RNA", 'bacterial_RNA', "ncRNA", "riboswitch", "T-box_riboswitch"]] * len(riboswitch_protein)
riboswitch_protein = riboswitch_protein.explode(':TYPE')
riboswitch_protein = riboswitch_protein.groupby([':ID']).agg({'Sequence':'first','TBDB_ID':'first','Species':'first','Genomic_location':
                                                              lambda x: set(x.dropna()) if x.dropna().any() else set(),'Structure':'first',
                                                              'Label':'first',':TYPE':set}).reset_index()
riboswitch_protein[':TYPE'] = riboswitch_protein[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
riboswitch_protein.to_pickle(unprocessed_property_data_location + 'tbdb.pkl')
riboswitch_protein.head(n=2)

* RSwitch

In [None]:
riboswitch_bactStrain = pd.read_csv(unprocessed_data_location + 'rswitch.csv', header=None) 
riboswitch_bactStrain.rename(columns={0:'ID', 1:':TYPE', 2:'Species'},inplace=True)
print(riboswitch_bactStrain[':TYPE'].unique())
riboswitch_bactStrain[':TYPE'] = riboswitch_bactStrain[':TYPE'].str.replace('Sitemap:', 'nan')
riboswitch_bactStrain[':TYPE'] = riboswitch_bactStrain[':TYPE'].fillna('nan')
riboswitch_bactStrain[':TYPE'] = "RNA, bacterial_RNA, ncRNA, riboswitch, " + riboswitch_bactStrain[':TYPE'].astype(str).str.replace(" ", "_")
riboswitch_bactStrain[':TYPE'] = riboswitch_bactStrain[':TYPE'].str.replace(', nan', '')
riboswitch_bactStrain[':ID'] = "https://penchovsky.atwebpages.com/applications.php?page=58?" + riboswitch_bactStrain['ID']
riboswitch_bactStrain[':TYPE'] = riboswitch_bactStrain[':TYPE'].str.split(", ")
riboswitch_bactStrain = riboswitch_bactStrain.explode(':TYPE')
riboswitch_bactStrain = riboswitch_bactStrain.groupby([':ID']).agg({'ID':'first','Species':'first',':TYPE':set}).reset_index()
riboswitch_bactStrain[':TYPE'] = riboswitch_bactStrain[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
riboswitch_bactStrain.to_pickle(unprocessed_property_data_location + 'rswitch.pkl')
riboswitch_bactStrain.head(n=3)

* Apta-Index

In [None]:
aptamer_protein = pd.read_csv(unprocessed_data_location + 'aptaindex.csv',names=['Label', 'ID', 'Target', 'Sequence'],skiprows=[0]) 
aptamer_protein.Target = aptamer_protein.Target.str.lower()
aptamer_protein['ID'] = 'aptamer-details/?id=' + aptamer_protein['ID'].astype(str)
aptamer_protein = aptamer_protein.drop(columns=['Target'])
aptamer_protein = aptamer_protein[['ID','Sequence']]
aptamer_protein['Sequence'] = aptamer_protein['Sequence'].str.replace('[^ATCGU]', '', regex=True)
aptamer_protein['Sequence'] = aptamer_protein['Sequence'].str.replace('T', 'U')
aptamer_protein['Sequence'] = aptamer_protein['Sequence'].fillna('nan')
aptamer_protein['Sequence'] = aptamer_protein['Sequence'].replace('', 'nan', regex=True)
aptamer_protein[':TYPE'] = [["RNA", "ncRNA", "sncRNA", 'oligo', "aptamer", "RNA_aptamer"]] * len(aptamer_protein)
aptamer_protein = aptamer_protein.explode(':TYPE')
aptamer_protein = aptamer_protein.groupby(['ID']).agg({'Sequence':lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan, ':TYPE':set}).reset_index()
aptamer_protein[':TYPE'] = aptamer_protein[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
aptamer_protein[':ID'] = "https://www.aptagen.com/" + aptamer_protein['ID']
aptamer_protein['Apta-Index_ID'] = aptamer_protein['ID'].str.replace("aptamer-details/?id=","")
aptamer_protein.drop(columns=['ID'],inplace=True)
aptamer_protein.to_pickle(unprocessed_property_data_location + 'aptaindex.pkl')
aptamer_protein.head(n=3)

* snoDB

In [None]:
snoDB = pd.read_csv(unprocessed_data_location + 'download_all', sep="\t")
snoDB = snoDB[['rna_central_id','host_gene_id','rrna_targets','snrna_targets','lncrna_targets','protein_coding_targets','snorna_targets',
               'mirna_targets','trna_targets','ncrna_targets','pseudogene_targets','other_targets','is_expressed']]
snoDB = snoDB[snoDB['rna_central_id'].notna()]
snoDB = snoDB.rename(columns={'rna_central_id':':START_ID'})
for col in snoDB.columns:
    snoDB[col] = snoDB[col].astype(str).str.split(';')
for col in snoDB.columns:
    snoDB = snoDB.explode(col)

snoRNA_rRNA = snoDB[['rrna_targets']].drop_duplicates().rename(columns={'rrna_targets':'Label'})
snoRNA_rRNA = snoRNA_rRNA[snoRNA_rRNA['Label'] != 'nan']
snoRNA_rRNA['Label'] = snoRNA_rRNA['Label'].str.split('.').str[0].str.strip()
snoRNA_rRNA[':TYPE'] = [['RNA', 'ncRNA', 'rRNA']] * len(snoRNA_rRNA)
snoRNA_rRNA[':TYPE'] = snoRNA_rRNA[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
snoRNA_rRNA[':ID'] = "http://scottgroup.med.usherbrooke.ca/snoDB?" + snoRNA_rRNA['Label']
snoRNA_rRNA['Species'] = 'Homo sapiens'
snoRNA_rRNA.to_pickle(unprocessed_property_data_location + 'snodb.pkl')
snoRNA_rRNA.head(n=3)

***
# Genome

* ViroidDB

In [None]:
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T 

# Extract ribozymes 
myre = re.compile(r"\n>> .*?\n")
ribozyme = [myre.findall(i) for i in vRNA_ribozyme.ribozymes]
ribozyme = [[j.replace("\n",'').replace(">> ",'') for j in i] for i in ribozyme]

rnacentral_map_rfam = pd.read_csv(processed_data_location + "RNAcentral_MAP/rfam.tsv",sep='\t',
                                     names=['RNAcentral ID', 'DB', 'Rfam ID', 'Organism', 'RNA category', "nan"]).drop(columns="nan")
rnacentral_map_human_rfam = rnacentral_map_rfam[rnacentral_map_rfam['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category'])

vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']), # Genome --> NCBI nuccore 
                           pd.Series(ribozyme)], axis=1)
vRNA_ribozyme = vRNA_ribozyme.explode(0)
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
vRNA_ribozyme.drop(columns=['isolationSource','collectionDate','gc','bioSample','identicalSeqs','genBankTitle','displayTitle',
                            'length','sequenceType','nucCompleteness','genotype','segment','moleculeType','publications',
                           'geoLocation','country','usa','submitters','releaseDate','isolate',
                            'sequence','type','Cls_ID80','genus','family','ribozymes',
                            'Cls_ID70','Cls_ID85','Cls_ID75','Cls_ID95','Cls_ID90','sraAccession','submitters','host'],
                   inplace=True)
vRNA_ribozyme.insert(0,1,vRNA_ribozyme.pop(0))
vRNA_ribozyme['accession'] = vRNA_ribozyme['accession'].str.split(".").str[0]

print(vRNA_ribozyme.species.unique()[:3])
# Among them, only Hepatitis delta virus (NCBI taxid: 12475) is a human pathogen
vRNA_ribozyme = vRNA_ribozyme[vRNA_ribozyme.species == 'Hepatitis delta virus']
rnacentral_map_rfam_delta = rnacentral_map_rfam[rnacentral_map_rfam['Organism'] == 12475]
ribozyme_rfam_map = pd.read_csv(processed_data_location + 'ribozyme_RFAM_MAP.txt', header=None, sep='\t')

vRNA_ribozyme = pd.merge(ribozyme_rfam_map,vRNA_ribozyme,left_on=0,right_on=1).drop(columns=['1_y'])
vRNA_ribozyme = pd.merge(vRNA_ribozyme.rename(columns={'1_x':'Rfam ID'}),rnacentral_map_rfam_delta[['RNAcentral ID','Rfam ID']].drop_duplicates(),
                         on='Rfam ID').drop(columns=['Rfam ID',0,'species'])

vRNA_ribozyme.rename(columns={'RNAcentral ID':':START_ID','accession':':END_ID', 'structure':'Structure'},inplace=True)
vRNA_ribozyme.Structure = vRNA_ribozyme.Structure.astype(str).str.split("'minus': {'dbn': '").str[1].str.split("'").str[0]
vRNA_ribozyme['Source'] = 'ViroidDB'
vRNA_ribozyme = vRNA_ribozyme[[':END_ID','Structure']].drop_duplicates(subset=[':END_ID'],keep='first').rename(columns={':END_ID':'NCBI_ID'})
vRNA_ribozyme[':ID'] = "https://www.ncbi.nlm.nih.gov/nuccore/" + vRNA_ribozyme['NCBI_ID']
vRNA_ribozyme[':TYPE'] = [['Genome', 'Viral_genome', 'RNA_genome', 'ss-RNA']] * len(vRNA_ribozyme)
vRNA_ribozyme['Species'] = "Hepatitis delta virus"
vRNA_ribozyme['Description'] = "Hepatitis delta virus, complete genome"
vRNA_ribozyme['Label'] = "Complete genome " + vRNA_ribozyme['NCBI_ID']
vRNA_ribozyme[':TYPE'] = vRNA_ribozyme[':TYPE'].apply(lambda items: [i for i in items]).apply(json.dumps)
vRNA_ribozyme.head(n=3)

In [None]:
def fetch_sequence(accession, api_key=None):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "nuccore",
        "id": accession,
        "rettype": "fasta",
        "retmode": "text"
    }
    if api_key:
        params["api_key"] = api_key
    response = requests.get(base_url, params=params)
    if response.status_code == 200:
        return response.text
    else:
        return np.nan

vRNA_ribozyme['Sequence'] = vRNA_ribozyme['NCBI_ID'].apply(lambda acc: fetch_sequence(acc, api_key='8d88dc3d63cd73854f0034baa217b05a9808'))
vRNA_ribozyme['Sequence'] = vRNA_ribozyme['Sequence'].str.upper().str.strip().str.split("GENOME").str[1].str.replace("T","U").str.replace("\n","").str.strip()
vRNA_ribozyme['Sequence'] = vRNA_ribozyme['Sequence'].replace("nan",np.nan)
vRNA_ribozyme.to_pickle(unprocessed_property_data_location + 'genome.pkl')
vRNA_ribozyme.head(n=2)

***
## Chemical modification

In [None]:
tRNA_mod = pd.read_csv(edge_data_location+'modification-tRNA2314.txt', sep='\t')[['Modification']].drop_duplicates().dropna()
tRNA_mod = tRNA_mod.rename(columns={'Modification':'Label'})
tRNA_mod['Modomics_ID'] = tRNA_mod['Label'].str.strip().str.replace(r'\s+', '_', regex=True)
tRNA_mod[':ID'] = "https://genesilico.pl/modomics?" + tRNA_mod['Modomics_ID']
tRNA_mod[':TYPE'] = [["Genomic_feature", "Epigenetic_modification"]] * len(tRNA_mod)
tRNA_mod[':TYPE'] = tRNA_mod[':TYPE'].apply(json.dumps)
tRNA_mod.to_pickle(unprocessed_property_data_location + 'chemicalModification.pkl')
tRNA_mod.head(n=3)

***
# Biological role

In [None]:
id = ['https://www.genome.gov/genetics-glossary/Tumor-Suppressor-Gene', 'https://www.genome.gov/genetics-glossary/Oncogene', 'https://www.genome.gov/genetics-glossary/General']
name = ['Tumor-Suppressor-Gene', 'Oncogene', 'General']
label = ['Tumor Suppressor Gene', 'Oncogene', 'General']
definition = ['A tumor suppressor gene encodes a protein that acts to regulate cell division, keeping it in check. When a tumor suppressor gene is inactivated by a mutation, the protein it encodes is not produced or does not function properly, and as a result, uncontrolled cell division may occur. Such mutations may contribute to the development of a cancer.\
              Tumor Suppressor Gene. Tumor suppressor genes are present in all cells in our body. When they are switched on, they prevent ourselves from growing and dividing. You can think of them as being like the brakes of a car. However, when a tumor suppressor gene is switched off, either because the cell mistakenly deletes it or mutates it, the brake is released and the cell may start to grow and divide uncontrollably and potentially drive the cell to turn into a cancer cell.',
              'An oncogene is a mutated gene that has the potential to cause cancer. Before an oncogene becomes mutated, it is called a proto-oncogene, and it plays a role in regulating normal cell division. Cancer can arise when a proto-oncogene is mutated, changing it into an oncogene and causing the cell to divide and multiply uncontrollably. Some oncogenes work like an accelerator pedal in a car, pushing a cell to divide again and again. Others work like a faulty brake in a car parked on a hill, also causing the cell to divide unchecked.\
                Oncogene. The name of oncogene suggests it is a gene that can cause cancer. Initially, oncogenes were identified in viruses, which could cause cancers in animals. Later, it was found that oncogenes can be mutated copies of certain normal cellular genes also called proto-oncogenes. Intact proto-oncogenes play important functions, regulating normal cellular growth, division, and apoptosis, which is the name for programmed or controlled cell death. Oncogenes or mutated copies of the proto-oncogenes may lead to uncontrolled cell growth and the escape from cell death, which may result in cancer development.',
              np.nan]
role = pd.DataFrame({':ID': id, 'NIH_ID': name, 'Description': definition, 'Label': label}) 
role[':TYPE'] = [["Biological_role"]] * len(role)
role[':TYPE'] = role[':TYPE'].apply(json.dumps)
role.to_pickle(unprocessed_property_data_location + 'biologicalRole.pkl')
role

***
# Small protein

* SmProt

In [None]:
lncRNA_protein = pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz') 
lncRNA_protein['SmProt Protein Sequence'] = lncRNA_protein['SmProt Protein Sequence'].str.replace('*', '', regex=False)
lncRNA_protein = lncRNA_protein[['SmProt ID','SmProt Protein Sequence']].rename(columns={'SmProt ID':'SmProt_ID','SmProt Protein Sequence':'Sequence'})
lncRNA_protein[':ID'] = "http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=" + lncRNA_protein['SmProt_ID']
lncRNA_protein = lncRNA_protein.drop_duplicates(subset=[':ID'],keep='first')
lncRNA_protein[':TYPE'] = [["Protein", "Human_protein", "Small_protein"]] * len(lncRNA_protein)
lncRNA_protein[':TYPE'] = lncRNA_protein[':TYPE'].apply(json.dumps)
lncRNA_protein['Species'] = 'Homo sapiens'
lncRNA_protein.to_pickle(unprocessed_property_data_location + 'smprot.pkl')
lncRNA_protein.head(n=3)

* cncRNAdb

In [None]:
RNA_anatomy = pd.read_excel(unprocessed_data_location + 'Translated ncRNA.xlsx')
RNA_anatomy = RNA_anatomy[RNA_anatomy.Organism.str.contains('apiens')]
RNA_anatomy = RNA_anatomy[RNA_anatomy['Gene.ID'].notna()]
RNA_anatomy = RNA_anatomy[RNA_anatomy.Notes != 'It has been re-annotated as protein coding gene now']
RNA_anatomy = RNA_anatomy[['cncRNAdb.ID','Peptide']].rename(columns={'cncRNAdb.ID':'cncRNAdb_ID','Peptide':'Sequence'})
RNA_anatomy[':ID'] = "https://www.rna-society.org/cncrnadb?" + RNA_anatomy['cncRNAdb_ID']
RNA_anatomy = RNA_anatomy.drop_duplicates(subset=[':ID'],keep='first')
RNA_anatomy[':TYPE'] = [["Protein", "Human_protein", "Small_protein"]] * len(RNA_anatomy)
RNA_anatomy[':TYPE'] = RNA_anatomy[':TYPE'].apply(json.dumps)
RNA_anatomy['Species'] = 'Homo sapiens'
RNA_anatomy.to_pickle(unprocessed_property_data_location + 'cncrnadb.pkl')
RNA_anatomy.head(n=3)

***
# Reactome

In [None]:
reactome_pathways = pd.read_csv(unprocessed_data_location + 'ReactomePathways.txt', header=None, delimiter='\t', low_memory=False)
reactome_pathways = reactome_pathways.loc[reactome_pathways[2].apply(lambda x: x == 'Homo sapiens')].drop(columns=[2])
reactome_pathways[':TYPE'] = [["Pathway"]] * len(reactome_pathways)
reactome_pathways[':TYPE'] = reactome_pathways[':TYPE'].apply(json.dumps)
reactome_pathways['Species'] = 'Homo sapiens'
reactome_pathways[':ID'] = "https://reactome.org/content/detail/" + reactome_pathways[0]
reactome_pathways = reactome_pathways.rename(columns={0:'Reactome_ID',1:'Label'})
reactome_pathways.to_pickle(unprocessed_property_data_location + 'reactome.pkl')
reactome_pathways.head(n=3)

***
# Wikipathways

In [None]:
import io
with open(unprocessed_data_location+'wpw_reactome.csv', 'r') as file:
    data = file.read().rstrip()
    
desc_wpw_map = pd.DataFrame([ ln.rstrip().split('\t') for ln in
    io.StringIO(data).readlines() ]).fillna('')

desc_wpw_map = desc_wpw_map[[0,1]]
desc_wpw_map[':TYPE'] = 'Pathway'
desc_wpw_map[0] = desc_wpw_map[0].astype(str).str.replace(r'%WikiPathways_20240410%WP.*%Homo sapiens','',regex=True)

desc_wpw_map[':TYPE'] = [["Pathway"]] * len(desc_wpw_map)
desc_wpw_map[':TYPE'] = desc_wpw_map[':TYPE'].apply(json.dumps)
desc_wpw_map['Species'] = 'Homo sapiens'
desc_wpw_map['WikiPathways_ID'] = desc_wpw_map[1].str.split("/").str[-1]
desc_wpw_map = desc_wpw_map.rename(columns={0:'Label',1:':ID'})
desc_wpw_map.to_pickle(unprocessed_property_data_location + 'wikipathways.pkl')
desc_wpw_map.head(n=3)

***
# SNP

In [None]:
variant_data = pd.read_csv(unprocessed_data_location + 'variant_summary.txt', header=0, delimiter='\t', low_memory=False)

variant_data = variant_data[(variant_data['Assembly'] == 'GRCh38') & (variant_data['RS# (dbSNP)'] != -1)]
variant_data['RS# (dbSNP)'] = 'rs' + variant_data['RS# (dbSNP)'].astype(str)
variant_data['Genomic_location'] = "chr"+variant_data['Chromosome'].astype(str)+":"+variant_data['Start'].astype(str)+"-"+variant_data['Stop'].astype(str)
variant_data['Mutation'] = variant_data['ReferenceAlleleVCF'] + ">" + variant_data['AlternateAlleleVCF'] 
variant_data = variant_data[["RS# (dbSNP)","Type","Genomic_location","Mutation"]]

print(variant_data.Type.unique())
variant_data.Type = variant_data.Type.replace('single nucleotide variant', 'Variant, SNP, SNV')
variant_data.Type = variant_data.Type.replace('Indel', 'Variant, SNP, Indel')
variant_data.Type = variant_data.Type.replace('Deletion', 'Variant, SNP, Deletion')
variant_data.Type = variant_data.Type.replace('Duplication', 'Variant, SNP, Duplication')
variant_data.Type = variant_data.Type.replace('Microsatellite', 'Variant, SNP')
variant_data.Type = variant_data.Type.replace('Insertion', 'Variant, SNP, Insertion')
variant_data.Type = variant_data.Type.replace('Variation', 'Variant, SNP')
variant_data.Type = variant_data.Type.replace('Inversion', 'Variant, SNP, Inversion')
print(variant_data.Type.unique())

variant_data = variant_data.groupby(['RS# (dbSNP)']).agg({'Mutation':'first', 'Genomic_location':lambda x: set(x.dropna()) if x.dropna().any() else set(),
                                                          'Type':'first'}).reset_index()
variant_data[':TYPE'] = variant_data['Type'].str.split(", ").apply(json.dumps)
variant_data[':ID'] = "https://www.ncbi.nlm.nih.gov/snp/" + variant_data['RS# (dbSNP)']
variant_data['Label'] = variant_data['RS# (dbSNP)']
variant_data = variant_data[[':ID','Label','Mutation','Genomic_location',':TYPE']]
variant_data['NCBI_ID'] = variant_data['Label']
variant_data['Genomic_location'] = variant_data['Genomic_location'].apply(lambda items: [i for i in items]).apply(json.dumps)
variant_data.to_pickle(unprocessed_property_data_location + 'snp.pkl')
variant_data.head(n=3)

***
# COSMIC

In [None]:
cosmic = pd.read_csv(unprocessed_data_location + "Cosmic_NonCodingVariants_Normal_v101_GRCh38.vcf",
                     sep="\t", comment="#", names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO'], low_memory=False)
cosmic = cosmic[cosmic['ID'] != "."]
cosmic['Mutation'] = cosmic['REF'] + '>' + cosmic['ALT']
cosmic['INFO'] = cosmic['INFO'].str.split(';')
info_dicts = cosmic['INFO'].apply(lambda items: dict(part.split('=', 1) for part in items if '=' in part))
info_df = pd.json_normalize(info_dicts)
cosmic = pd.concat([cosmic, info_df], axis=1)
cosmic = cosmic[cosmic['IS_CANONICAL'] == 'y']
cosmic['Genomic_location'] = "chr" + cosmic['CHROM'].astype(str) + ':' + cosmic['POS'].astype(str) + "-" +\
    (cosmic['POS'].astype(int) + cosmic['REF'].str.len()).astype(str) + cosmic['STRAND'].astype(str)

print(cosmic.SO_TERM.unique())
cosmic.SO_TERM = cosmic.SO_TERM.replace('SNV', 'Variant, Somatic_variant, SNV')
cosmic.SO_TERM = cosmic.SO_TERM.replace('indel', 'Variant, Somatic_variant, Indel')
cosmic.SO_TERM = cosmic.SO_TERM.replace('deletion', 'Variant, Somatic_variant, Deletion')
cosmic.SO_TERM = cosmic.SO_TERM.replace('duplication', 'Variant, Somatic_variant, Duplication')
cosmic.SO_TERM = cosmic.SO_TERM.replace('microsatellite', 'Variant, Somatic_variant')
cosmic.SO_TERM = cosmic.SO_TERM.replace('insertion', 'Variant, Somatic_variant, Insertion')
cosmic.SO_TERM = cosmic.SO_TERM.replace('variation', 'Variant, Somatic_variant')
cosmic.SO_TERM = cosmic.SO_TERM.replace('inversion', 'Variant, Somatic_variant, Inversion')
print(cosmic.SO_TERM.unique())

cosmic = cosmic.groupby(['ID']).agg({'Mutation':'first', 'Genomic_location':lambda x: set(x.dropna()) if x.dropna().any() else set(),
                                     'SO_TERM':'first'}).reset_index()
cosmic[':TYPE'] = cosmic['SO_TERM'].str.split(", ").apply(json.dumps)
cosmic[':ID'] = "https://cancer.sanger.ac.uk/cosmic/mutation/overview?id=" + cosmic['ID']
cosmic['Label'] = cosmic['ID']
cosmic = cosmic[[':ID','Label','Mutation','Genomic_location',':TYPE']]
cosmic['Species'] = 'Homo sapiens'
cosmic['COSMIC_ID'] = cosmic['Label']
cosmic['Genomic_location'] = cosmic['Genomic_location'].apply(lambda items: [i for i in items]).apply(json.dumps)
cosmic.to_pickle(unprocessed_property_data_location + 'cosmic.pkl')
cosmic.head(n=3)

***
# Gene

In [None]:
merged_data_clean = pd.read_csv(processed_data_location + 'Merged_Human_Ensembl_Entrez_HGNC_Uniprot_Identifiers.txt',
                                sep='\t', dtype={'entrez_id':str})[['ensembl_gene_id', 'symbol', 'ensembl_gene_type',
                                                                    'entrez_id', 'synonyms']]
merged_data_clean = merged_data_clean[(merged_data_clean['entrez_id'].astype(str).str.match(r'^\d'))].drop_duplicates()

ensembl_geneset = pd.read_csv(unprocessed_data_location + 'Homo_sapiens.GRCh38.113.gtf',
                                  header = None, delimiter='\t', skiprows=5, low_memory=False)
ensembl_geneset = ensembl_geneset[ensembl_geneset[2] == 'gene']
ensembl_geneset[8] = ensembl_geneset[8].str.split("gene_id \"").str[1].str.split("\"").str[0]
ensembl_geneset['Genomic_location'] = "chr" + ensembl_geneset[0].astype(str)+":"+ensembl_geneset[3].astype(str)+"-"+\
      ensembl_geneset[4].astype(str) + ensembl_geneset[6].astype(str)
ensembl_geneset = ensembl_geneset[[8, "Genomic_location"]]

merged_data_clean = merged_data_clean.merge(ensembl_geneset, left_on='ensembl_gene_id', right_on=[8], how='left').drop(columns=[8])
merged_data_clean.head(n=3)

In [None]:
print(merged_data_clean.ensembl_gene_type.unique())
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace(np.nan, 'Gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('protein-coding', 'Gene, Protein_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('unknown', 'Gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('TR_C_gene', 'Gene, Protein_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('miRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('lncRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('snoRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('processed_pseudogene', 'Pseudogene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('unprocessed_pseudogene', 'Pseudogene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('transcribed_processed_pseudogene', 'Pseudogene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('transcribed_unitary_pseudogene', 'Pseudogene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('unitary_pseudogene', 'Pseudogene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('Mt_tRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('rRNA_pseudogene', 'Pseudogene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('scaRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('snRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('scRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('vaultRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('rRNA', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('ribozyme', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('transcribed_unprocessed_pseudogene', 'Gene, Non_coding_gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('miscRNA', 'Gene')
merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.replace('artifact', 'Gene')
print(merged_data_clean.ensembl_gene_type.unique())

merged_data_clean.ensembl_gene_type = merged_data_clean.ensembl_gene_type.str.split(", ")
merged_data_clean = merged_data_clean.explode('ensembl_gene_type')
merged_data_clean.head(n=3)

In [None]:
from tqdm import tqdm

def get_sequence(ensembl_gene_id):
    base_url = "https://rest.ensembl.org/sequence/id/"
    response = requests.get(base_url + ensembl_gene_id, headers={"Content-Type": "application/json"})
    if response.ok:
        return response.json()['seq']
    else:
        return None

ensembl_id = pd.DataFrame(merged_data_clean['ensembl_gene_id'].unique())
ensembl_id['Sequence'] = [get_sequence(x) for x in tqdm(ensembl_id[0])]
ensembl_id['Sequence'] = ensembl_id['Sequence'].str.upper()
ensembl_id.to_csv(unprocessed_property_data_location + 'ensembl_sequences.csv', index=None)
#ensembl_id = pd.read_csv(unprocessed_property_data_location + 'ensembl_sequences.csv')
merged_data_clean = merged_data_clean.merge(ensembl_id, left_on='ensembl_gene_id', right_on='0', how='left').drop(columns=['0'])
merged_data_clean.head(n=3)

In [None]:
merged_data_clean = merged_data_clean.groupby(['entrez_id']).agg({'synonyms':lambda x: set(x.dropna()) if x.dropna().any() else set(),
                                                                  "ensembl_gene_id":lambda x: x.dropna().iloc[0]
                                                                    if not x.dropna().empty else np.nan,
                                                                  'ensembl_gene_type':set,
                                                                  'Genomic_location':lambda x: set(x.dropna()) if x.dropna().any() else set(),
                                                                  "Sequence":lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan,
                                                                  'symbol':lambda x: x.dropna().iloc[0] if not x.dropna().empty else np.nan})\
                                                                    .reset_index()

merged_data_clean[':ID'] = "http://www.ncbi.nlm.nih.gov/gene/" + merged_data_clean['entrez_id'].astype(str)
merged_data_clean = merged_data_clean[merged_data_clean[':ID'] != 'http://www.ncbi.nlm.nih.gov/gene/nan']
merged_data_clean['synonyms'] = merged_data_clean['synonyms'].apply(lambda items: [i for i in items]).apply(json.dumps)
merged_data_clean['ensembl_gene_type'] = merged_data_clean['ensembl_gene_type'].apply(lambda items: [i for i in items]).apply(json.dumps)
merged_data_clean['Genomic_location'] = merged_data_clean['Genomic_location'].apply(lambda items: [i for i in items]).apply(json.dumps)
merged_data_clean = merged_data_clean.rename(columns={'entrez_id':'NCBI_ID', 'ensembl_gene_id': 'Ensembl_ID',
                                  'symbol': 'Symbol', 'synonyms':'Synonym', 'ensembl_gene_type':':TYPE'})
merged_data_clean.to_pickle(unprocessed_property_data_location + 'gene.pkl')
merged_data_clean.head(n=2)

***

<!-- We keep db entities for which we processed properties only if they are within nodes coming from linked open data edges.

In [None]:
rnacentral = pd.read_pickle(unprocessed_property_data_location + "RNAcentral.pkl")
rnacentral['KG_ID'] = rnacentral['RNAcentral_ID']
ensembl = pd.read_pickle(unprocessed_property_data_location + "ensembl.pkl")
ensembl['KG_ID'] = ensembl['Ensembl_ID']
addgene = pd.read_pickle(unprocessed_property_data_location + "addgene.pkl")
addgene['KG_ID'] = addgene[':ID'].str.replace("https://","")
icbp = pd.read_pickle(unprocessed_property_data_location + "icbp.pkl")
icbp['KG_ID'] = icbp[':ID'].str.replace("http://web.mit.edu/sirna/sequences/results-","")
circbase = pd.read_pickle(unprocessed_property_data_location + "circbase.pkl")
circbase['KG_ID'] = circbase['circBase_ID']
eskipFinder = pd.read_pickle(unprocessed_property_data_location + "eskipFinder.pkl")
eskipFinder['KG_ID'] = eskipFinder['Label'].str.strip().str.replace(' ', '')
tsrfun = pd.read_pickle(unprocessed_property_data_location + "tsrfun.pkl")
tsrfun['KG_ID'] = tsrfun['tsRFun_ID']
trfdb = pd.read_pickle(unprocessed_property_data_location + "trfdb.pkl")
trfdb['Genomic_location'] = trfdb['Genomic_location'].apply(lambda items: [i for i in items]).apply(json.dumps)
trfdb['KG_ID'] = "trfdb?" + trfdb['tRF_ID']
mintbase = pd.read_pickle(unprocessed_property_data_location + "mintbase.pkl")
mintbase['Genomic_location'] = mintbase['Genomic_location'].apply(lambda items: [i for i in items]).apply(json.dumps)
mintbase['KG_ID'] = mintbase['MINTbase_ID']
tbdb = pd.read_pickle(unprocessed_property_data_location + "tbdb.pkl")
tbdb['KG_ID'] = tbdb[':ID']
rswitch = pd.read_pickle(unprocessed_property_data_location + "rswitch.pkl")
rswitch.rename(columns={'ID':'RSwitch_ID'},inplace=True)
rswitch['KG_ID'] = rswitch['RSwitch_ID']
aptaindex = pd.read_pickle(unprocessed_property_data_location + "aptaindex.pkl")
aptaindex['KG_ID'] = aptaindex[':ID'].str.replace("https://www.aptagen.com/","")
snodb = pd.read_pickle(unprocessed_property_data_location + "snodb.pkl")
snodb['KG_ID'] = snodb['Label']
genome = pd.read_pickle(unprocessed_property_data_location + "genome.pkl")
genome['KG_ID'] = genome['NCBI_ID']
chemicalModification = pd.read_pickle(unprocessed_property_data_location + "chemicalModification.pkl")
chemicalModification['KG_ID'] = chemicalModification['Modomics_ID']
biologicalRole = pd.read_pickle(unprocessed_property_data_location + "biologicalRole.pkl")
biologicalRole['KG_ID'] = biologicalRole['NIH_ID']
smprot = pd.read_pickle(unprocessed_property_data_location + "smprot.pkl")
smprot['KG_ID'] = smprot['SmProt_ID']
cncrnadb = pd.read_pickle(unprocessed_property_data_location + "cncrnadb.pkl")
cncrnadb['KG_ID'] = cncrnadb['cncRNAdb_ID']
reactome = pd.read_pickle(unprocessed_property_data_location + "reactome.pkl")
reactome['KG_ID'] = reactome['Reactome_ID']
wikipathways = pd.read_pickle(unprocessed_property_data_location + "wikipathways.pkl")
wikipathways['KG_ID'] = wikipathways['WikiPathways_ID']
snp = pd.read_pickle(unprocessed_property_data_location + "snp.pkl")
snp['KG_ID'] = snp['NCBI_ID']
cosmic = pd.read_pickle(unprocessed_property_data_location + "cosmic.pkl")
cosmic['Genomic_location'] = cosmic['Genomic_location'].str.replace(".0","")
cosmic['KG_ID'] = cosmic['COSMIC_ID']
gene = pd.read_pickle(unprocessed_property_data_location + "gene.pkl")
gene['KG_ID'] = gene['NCBI_ID']
gene['Species'] = 'Homo sapiens'
db_entities = pd.concat([rnacentral,ensembl,addgene,icbp,circbase,eskipFinder,tsrfun,trfdb,mintbase,
                         tbdb,rswitch,aptaindex,snodb,genome,chemicalModification,biologicalRole,
                         smprot,cncrnadb,reactome,wikipathways,snp,cosmic,gene])

db_entities['Label'] = db_entities['Label'].fillna(db_entities['Symbol'])
db_entities = db_entities.drop(columns=['Symbol'])
db_entities['ID'] = "RNAcentral:" + db_entities['RNAcentral_ID'] + "_9606"
db_entities['Ensembl_ID'] = "Ensembl:" + db_entities['Ensembl_ID']
db_entities['Addgene_ID'] = "Addgene:" + db_entities['Addgene_ID']
db_entities['ICBP_ID'] = "ICBP:" + db_entities['ICBP_ID']
db_entities['circBase_ID'] = "circBase:" + db_entities['circBase_ID']
db_entities['tsRFun_ID'] = "tsRFun:" + db_entities['tsRFun_ID']
db_entities['tRF_ID'] = "tRF:" + db_entities['tRF_ID']
db_entities['MINTbase_ID'] = "MINTbase:" + db_entities['MINTbase_ID']
db_entities['TBDB_ID'] = "TBDB:" + db_entities['TBDB_ID']
db_entities['RSwitch_ID'] = "RSwitch:" + db_entities['RSwitch_ID']
db_entities['Apta-Index_ID'] = "Apta-Index:" + db_entities['Apta-Index_ID']
db_entities['NCBI_ID'] = "NCBI:" + db_entities['NCBI_ID']
db_entities['Modomics_ID'] = "Modomics:" + db_entities['Modomics_ID']
db_entities['NIH_ID'] = "NIH:" + db_entities['NIH_ID']
db_entities['SmProt_ID'] = "SmProt:" + db_entities['SmProt_ID']
db_entities['cncRNAdb_ID'] = "cncRNAdb:" + db_entities['cncRNAdb_ID']
db_entities['Reactome_ID'] = "Reactome:" + db_entities['Reactome_ID']
db_entities['WikiPathways_ID'] = "WikiPathways:" + db_entities['WikiPathways_ID']
db_entities['COSMIC_ID'] = "COSMIC:" + db_entities['COSMIC_ID']

db_entities['ID'] = db_entities['ID'].fillna(db_entities['Ensembl_ID']).fillna(db_entities['Addgene_ID']).fillna(
    db_entities['ICBP_ID']).fillna(db_entities['circBase_ID']).fillna(db_entities['tsRFun_ID']).fillna(
        db_entities['tRF_ID']).fillna(db_entities['MINTbase_ID']).fillna(db_entities['TBDB_ID']).fillna(
            db_entities['RSwitch_ID']).fillna(db_entities['Apta-Index_ID']).fillna(
                db_entities['NCBI_ID']).fillna(db_entities['Modomics_ID']).fillna(
                    db_entities['NIH_ID']).fillna(db_entities['SmProt_ID']).fillna(
                        db_entities['Reactome_ID']).fillna(
                        db_entities['WikiPathways_ID']).fillna(
                        db_entities['cncRNAdb_ID']).fillna(
                        db_entities['COSMIC_ID'])

db_entities = db_entities.drop(columns=['Ensembl_ID','Addgene_ID','ICBP_ID','circBase_ID','tsRFun_ID',
                                        'tRF_ID','MINTbase_ID','TBDB_ID','RSwitch_ID','Apta-Index_ID',
                                        'NCBI_ID','Modomics_ID','NIH_ID','SmProt_ID','Reactome_ID',
                                        'WikiPathways_ID','cncRNAdb_ID','COSMIC_ID','RNAcentral_ID'])

db_entities.to_pickle(unprocessed_property_data_location + 'db_entities.pkl')
db_entities.head(n=3)

In [None]:
'''file = []
for filename in os.listdir(unprocessed_edge_data_location):
    if filename.endswith('.pkl') and filename.split('_')[0] != 'OBO':
        file.append(filename)

start_id = set()
for f in file :
    df = pd.read_pickle(unprocessed_edge_data_location + f)
    print(f)
    start_id.update(df[':START_ID'].unique())
    print(list(start_id)[:3])
    print(list(start_id)[-3:])

file = []
for filename in os.listdir(unprocessed_edge_data_location):
    if filename.endswith('.pkl') and filename.split('_')[-1].replace('.pkl', '') != 'OBO':
        file.append(filename)  

end_id = set()
for f in file :
    df = pd.read_pickle(unprocessed_edge_data_location + f)
    print(f)
    end_id.update(df[':END_ID'].unique())
    print(list(end_id)[:3])
    print(list(end_id)[-3:])

db_entities_in_KG = pd.DataFrame(list(start_id.union(end_id))).drop_duplicates().reset_index(drop=True)
db_entities_in_KG.to_pickle(unprocessed_property_data_location + 'db_entities_in_KG.pkl')
db_entities_in_KG.head(n=3)'''

In [None]:
'''db_entities_in_KG = pd.read_pickle(unprocessed_property_data_location + 'db_entities_in_KG.pkl')
db_entities_in_KG[0] = db_entities_in_KG[0].astype(str)
db_entities = pd.read_pickle(unprocessed_property_data_location + 'db_entities.pkl')
db_entities['KG_ID'] = db_entities['KG_ID'].astype(str)
db_entities = db_entities[db_entities['KG_ID'].isin(db_entities_in_KG[0])]
db_entities.head(n=3)'''

***
# OBO

In [None]:
# Takes 35/50 minutes
obo_graph = Graph()
obo_graph.parse(ontology_data_location + 'merged_with_imports.owl')
print('There are {} edges in the ontology.'.format(len(obo_graph)))

In [None]:
hasOBONamespace = URIRef("http://www.geneontology.org/formats/oboInOwl#hasOBONamespace")
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")
chargeName = URIRef("http://purl.obolibrary.org/obo/chebi/charge")
massName = URIRef("http://purl.obolibrary.org/obo/chebi/mass")
smilesName = URIRef("http://purl.obolibrary.org/obo/chebi/smiles")
formulaName = URIRef("http://purl.obolibrary.org/obo/chebi/formula")
inchikeyName = URIRef("http://purl.obolibrary.org/obo/chebi/inchikey")
'''
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
cls = {x for x in gets_ontology_classes(obo_graph)}
master_synonyms = {x for x in obo_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)}

print("Caching...")
cache_labels = {
    x: next(
        iter(
            {val for val in obo_graph.objects(x, RDFS.label) if val.language == 'en'} or
            {val for val in obo_graph.objects(x, RDFS.label) if val.language is None} or
            {val for val in obo_graph.objects(x, RDFS.label)} or
            {val for val in obo_graph.objects(x, obo.VO_0003158)}
        ),
        np.nan
    )
    for x in cls
}
cache_synonyms = {
    x: {val for val in obo_graph.objects(x, RDFS.label)}.union(
            {val for val in obo_graph.objects(x, obo.VO_0003158)}).union(
                {val for val in obo_graph.objects(x, obo.IAO_0000118)}).union(
                    {val for val in obo_graph.objects(x, obo.VO_0003099)}).union(
                        {val for val in obo_graph.objects(x, obo.OBI_9991118)})
    for x in cls
}
cache_descriptions = {
    x: next(
        iter(
            {str(desc).lower().strip() for desc in obo_graph.objects(x, obo.IAO_0000115) if desc.language == 'en'} or
            {str(desc).lower().strip() for desc in obo_graph.objects(x, obo.IAO_0000115) if desc.language is None} or
            {str(desc).lower().strip() for desc in obo_graph.objects(x, obo.IAO_0000115)}
        ),
        np.nan
    )
    for x in cls
}

cache_go_vocab = {x: set(obo_graph.objects(x, hasOBONamespace)) for x in cls}
cache_fda = {x: set(obo_graph.objects(x, obo.VO_0003160)) for x in cls}
cache_charge = {x: set(obo_graph.objects(x, chargeName)) for x in cls}
cache_mass = {x: set(obo_graph.objects(x, massName)) for x in cls}
cache_smiles = {x: set(obo_graph.objects(x, smilesName)) for x in cls}
cache_formula = {x: set(obo_graph.objects(x, formulaName)) for x in cls}
cache_inchikey = {x: set(obo_graph.objects(x, inchikeyName)) for x in cls}
cache_dbxref = {x: set(obo_graph.objects(x, dbxref_uri)) for x in cls}

cache_dict = {
    "classes": cls,
    "syn": master_synonyms,
    "labels": cache_labels,
    "synonyms": cache_synonyms,
    "descriptions": cache_descriptions,
    "go_vocab": cache_go_vocab,
    "fda": cache_fda,
    "charge": cache_charge,
    "mass": cache_mass,
    "smiles": cache_smiles,
    "formula": cache_formula,
    "inchikey": cache_inchikey,
    "dbxref": cache_dbxref
}

cache_path = unprocessed_property_data_location + "obo_cache.pkl"

with open(cache_path, "wb") as f:
    pickle.dump(cache_dict, f)

print("Cache saved")

'''

cache_path = unprocessed_property_data_location + "obo_cache.pkl"

if os.path.exists(cache_path):
    with open(cache_path, "rb") as f:
        cache_dict = pickle.load(f)
    
    cls = cache_dict["classes"]
    master_synonyms = cache_dict["syn"]
    cache_labels = cache_dict["labels"]
    cache_synonyms = cache_dict["synonyms"]
    cache_descriptions = cache_dict["descriptions"]
    cache_go_vocab = cache_dict["go_vocab"]
    cache_fda = cache_dict["fda"]
    cache_charge = cache_dict["charge"]
    cache_mass = cache_dict["mass"]
    cache_smiles = cache_dict["smiles"]
    cache_formula = cache_dict["formula"]
    cache_inchikey = cache_dict["inchikey"]
    cache_dbxref = cache_dict["dbxref"]

    print("Cache loaded")
else:
    print("Cache non found")

def process_class_metadata(cls_item):
    #print("Processing: ", cache_labels.get(cls_item, np.nan))
    fda = cache_fda.get(cls_item, set())
    charge = cache_charge.get(cls_item, set())
    mass = cache_mass.get(cls_item, set())
    smiles = cache_smiles.get(cls_item, set())
    formula = cache_formula.get(cls_item, set())
    inchikey = cache_inchikey.get(cls_item, set())

    return str(cls_item), {
        'Label': cache_labels.get(cls_item, np.nan),
        'Description': cache_descriptions.get(cls_item, np.nan),
        'Synonym': cache_synonyms.get(cls_item, set()),
        "GOvocab": cache_go_vocab.get(cls_item, set()),
        'FDA_indications': next(iter(fda if isinstance(fda, set) else set()), np.nan),
        'Charge': next(iter(charge if isinstance(charge, set) else set()), np.nan),
        'Mass': next(iter(mass if isinstance(mass, set) else set()), np.nan),
        'SMILES': next(iter(smiles if isinstance(smiles, set) else set()), np.nan),
        'Formula': next(iter(formula if isinstance(formula, set) else set()), np.nan),
        'InChIKey': next(iter(inchikey if isinstance(inchikey, set) else set()), np.nan),
        'DbXref': cache_dbxref.get(cls_item, set())
    }

results = []
for cls_item in tqdm(cls, desc="Processing classes"):
    results.append(process_class_metadata(cls_item))

relation_metadata_dict = dict(results)
pd.DataFrame.from_dict(relation_metadata_dict, orient='index').to_pickle(unprocessed_property_data_location + 'obo_raw2.pkl')

# Adjust values
for x in relation_metadata_dict.values():
    if x['Label'] and x['Synonym']:
        x['Synonym'] = {str(i) for i in x['Synonym'].union({x['Label']}) - {x['Label']}}  # The rest as synonyms
    else:
        x['Label'] = np.nan 
    
    if not x['Label'] and x['Synonym']:
        label = list(x['Synonym'])
        x['Label'] = label[0]  # Keep the first synonym as label
        x['Synonym'] = {str(i) for i in x['Synonym'].union(label) - {x['Label']}}  # The rest as synonyms
    
    if x['Synonym']:
        x['Synonym'] = {str(i) for i in x['Synonym']}  

    if x['FDA_indications']:
        x['FDA_indications'] = str(x['FDA_indications'])
        if x['FDA_indications'] == 'nan':
            x['FDA_indications'] = np.nan

    if x['Description']:
        x['Description'] = str(x['Description'])
        if x['Description'] == 'nan':
            x['Description'] = np.nan

    if x['SMILES']:
        x['SMILES'] = str(x['SMILES'])
        if x['SMILES'] == 'nan':
            x['SMILES'] = np.nan

    if x['InChIKey']:
        x['InChIKey'] = str(x['InChIKey'])
        if x['InChIKey'] == 'nan':
            x['InChIKey'] = np.nan

    if x['Formula']:
        x['Formula'] = str(x['Formula'])
        if x['Formula'] == 'nan':
            x['Formula'] = np.nan

    if x['GOvocab']:
        x['GOvocab'] = {str(i).capitalize() for i in x['GOvocab'] if str(i) == 'biological_process' or 
                        str(i) == 'molecular_function' or  str(i) == 'cellular_component'}  
        
df = pd.DataFrame.from_dict(relation_metadata_dict, orient='index')

df['Charge'] = df['Charge'].astype(str).str.replace(r'^\+', '', regex=True)  # Remove `+`
df['Charge'] = pd.to_numeric(df['Charge'], errors='coerce')

df['Mass'] = df['Mass'].astype(str).str.replace(r'^\+', '', regex=True)  # Remove `+`
df['Mass'] = pd.to_numeric(df['Mass'], errors='coerce')

df[':ID'] = df.index
df = df.reset_index(drop=True)
df.to_pickle(unprocessed_property_data_location + 'obo2.pkl')
#df = pd.read_pickle(unprocessed_property_data_location + 'obo2.pkl')
df.head(n=2)

We add properties for DrugBank nodes.

In [None]:
DrugBank = pd.read_csv(processed_data_location + 'DrugBank/drugbank vocabulary.csv')[[
    'DrugBank ID','Common name','CAS','Synonyms','Standard InChI Key']]
links = pd.read_csv(processed_data_location + 'DrugBank/drug links.csv',dtype={'ChEBI ID':str})[['DrugBank ID', 'ChEBI ID']]
links['ChEBI ID'] = 'CHEBI_' + links['ChEBI ID']
DrugBank = pd.merge(DrugBank,links,on='DrugBank ID')
DrugBank['ChEBI ID'] = DrugBank['ChEBI ID'].fillna(DrugBank['DrugBank ID'])
DrugBank.rename(columns={'ChEBI ID':'ID'},inplace=True)
sequences = pd.DataFrame([
    {'ID': record.id.replace("drugbank_drug|", ""), 'Sequence': str(record.seq)}
    for record in SeqIO.parse(processed_data_location + 'DrugBank/drug sequences.fasta', 'fasta')
])
DrugBank = pd.merge(DrugBank,sequences,on='ID',how='outer')
DrugBank['ID'] = "http://purl.obolibrary.org/obo/" + DrugBank['ID']
DrugBank['DrugBank ID'] = "https://go.drugbank.com/drugs/" + DrugBank['DrugBank ID']
DrugBank['Type'] = "Chemical, Drug"
DrugBank.head(n=3)

In [None]:
file = []
for filename in os.listdir(processed_data_location+"/DrugBank"): 
    if filename.endswith('.txt'):
        tmp = pd.read_csv(f'{processed_data_location}/DrugBank/{filename}', sep="\t")
        tmp['Type'] = 'Chemical, Drug, RNA, RNA_drug, ' + filename.split('-')[0]
        file.append(tmp[['Drug', 'Type']].drop_duplicates())

df_final = pd.concat(file, ignore_index=True)
df_final['Type'] = df_final['Type'].str.replace("siRNA", "ncRNA, sncRNA, small_regulatory_ncRNA, siRNA")
df_final['Type'] = df_final['Type'].str.replace("aptamer", "ncRNA, sncRNA, oligo, aptamer, RNA_aptamer")
df_final['Type'] = df_final['Type'].str.replace("ASO", "ncRNA, sncRNA, oligo, antisense_oligonucleotide, RNA_antisense_oligonucleotide")
df_final['Type'] = df_final['Type'].str.replace("mRNAv", "Vaccine, mRNA, RNA_vaccine, mRNA_vaccine")
df_final.head(n=3)

In [None]:
DrugBank = pd.merge(DrugBank,df_final,left_on='Common name',right_on='Drug',how='left')
DrugBank['Type_y'] = DrugBank['Type_y'].fillna(DrugBank['Type_x'])
DrugBank.rename(columns={'Type_y':'Type'},inplace=True)
DrugBank.rename(columns={'Common name':'Label', 'Synonyms':'Synonym', 'Standard InChI Key':'InChIKey', 'ID':':ID'},inplace=True)
DrugBank = DrugBank[[':ID', 'Label', 'CAS', 'Synonym', 'InChIKey', 'Sequence', 'Type']]
DrugBank['Type'] = DrugBank['Type'].str.split(", ")
DrugBank['Type'] =  DrugBank['Type'].apply(lambda items: {i for i in items})
DrugBank['Synonym'] = DrugBank['Synonym'].astype(str).str.split(" \| ")
DrugBank['Synonym'] =  DrugBank['Synonym'].apply(lambda items: {i for i in items if i != 'nan'})

def adjust_label(row):
    label = (row['Label'])
    row['Label'] = label
    row['Synonym'] = row['Synonym'] - {label}
    return row

DrugBank = DrugBank.apply(adjust_label, axis=1)

DrugBank.head(n=3)

In [None]:
df = df.merge(DrugBank, on=':ID', how='outer')
df['Label_x'] = df['Label_x'].fillna(df['Label_y'])
df.rename(columns={'Label_x':'Label'},inplace=True)
df['InChIKey_x'] = df['InChIKey_x'].fillna(df['InChIKey_y'])
df.rename(columns={'InChIKey_x':'InChIKey'},inplace=True)
df['Synonym_x'] = df['Synonym_x'].apply(lambda x: x if isinstance(x, set) else set())
df['Synonym_y'] = df['Synonym_y'].apply(lambda x: x if isinstance(x, set) else set())
df['Synonym'] = df.apply(lambda row: row['Synonym_x'].union(row['Synonym_y']), axis=1).apply(lambda items: {i for i in items})
df['Type'] = df['Type'].apply(lambda x: x if isinstance(x, set) else set())
df['GOvocab'] = df['GOvocab'].apply(lambda x: x if isinstance(x, set) else set())
df['Type'] = df.apply(lambda row: row['Type'].union(row['GOvocab']), axis=1).apply(lambda items: {i for i in items})
df.drop(columns=['Label_y', 'Synonym_x', 'Synonym_y', 'GOvocab', 'InChIKey_y'], inplace=True)
df.head(n=3)

We add protein sequences from UniProtKB.

In [None]:
!wget https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz -O $unprocessed_property_data_location/uniprot_sprot.fasta.gz
!gunzip $unprocessed_property_data_location/uniprot_sprot.fasta.gz

In [None]:
records = []
for rec in SeqIO.parse(unprocessed_property_data_location + 'uniprot_sprot.fasta', 'fasta'):
    records.append({'ID': rec.id, 'Sequence': str(rec.seq)})
sprot = pd.DataFrame(records)
sprot.ID = sprot.ID.str.split("\|").str[1]
sprot.head(n=3)

In [None]:
df['DbXref'].astype(str).unique()

In [None]:
df['DbXref'] = df['DbXref'].apply(lambda x: x if isinstance(x, set) else set())
df['DbXref'] = df['DbXref'].apply(lambda x: {i.split(":")[1] for i in x if i.startswith('UniProtKB:') and len(i.split(":")) > 1})
df['DbXref'] = df['DbXref'].apply(lambda x: x.pop() if x else np.nan)

df = pd.merge(df, sprot, left_on='DbXref', right_on='ID', how='left').drop(columns=['ID','DbXref'])
df['Sequence_x'] = df['Sequence_x'].fillna(df['Sequence_y'])
df.rename(columns={'Sequence_x':'Sequence'},inplace=True)
df = df.drop(columns=['Sequence_y'])
df.to_pickle(unprocessed_property_data_location + 'obo2.pkl')
df.head(n=3)

We add DrugBank to ontology kg as we treated them as ontology classes.

In [None]:
DrugBank = pd.read_csv(processed_data_location + 'DrugBank/drugbank vocabulary.csv')
links = pd.read_csv(processed_data_location + 'DrugBank/drug links.csv',dtype={'ChEBI ID':str})[['DrugBank ID', 'ChEBI ID']]
links['ChEBI ID'] = 'CHEBI_' + links['ChEBI ID']
DrugBank = pd.merge(DrugBank,links,on='DrugBank ID')
DrugBank['ChEBI ID'] = DrugBank['ChEBI ID'].fillna(DrugBank['DrugBank ID'])
DrugBank = DrugBank[['ChEBI ID', 'Common name']]
DrugBank.head(n=3)

In [None]:
file = []
for filename in os.listdir(processed_data_location+"/DrugBank"): 
    if filename.endswith('.txt'):
        tmp = pd.read_csv(f'{processed_data_location}/DrugBank/{filename}', sep="\t")
        tmp[':END_ID'] = "http://purl.obolibrary.org/obo/CHEBI_23888, " + filename.split('-')[0]
        file.append(tmp[['Drug', ':END_ID']].drop_duplicates())

df_final = pd.concat(file, ignore_index=True)
df_final[':END_ID'] = df_final[':END_ID'].str.replace('ASO', 'http://purl.obolibrary.org/obo/SO_0001247, http://purl.obolibrary.org/obo/SO_0000644')
df_final[':END_ID'] = df_final[':END_ID'].str.replace('aptamer', 'http://purl.obolibrary.org/obo/SO_0001247, http://purl.obolibrary.org/obo/SO_0000033')
df_final[':END_ID'] = df_final[':END_ID'].str.replace('mRNAv', 'http://purl.obolibrary.org/obo/SO_0000234, http://purl.obolibrary.org/obo/VO_0000186, http://purl.obolibrary.org/obo/SO_0000351')
df_final[':END_ID'] = df_final[':END_ID'].str.replace('siRNA', 'http://purl.obolibrary.org/obo/SO_0000646, http://purl.obolibrary.org/obo/SO_0000351')
df_final.head(n=3)

In [None]:
DrugBank = pd.merge(DrugBank,df_final,left_on='Common name',right_on='Drug',how='outer')
DrugBank[':END_ID'] = DrugBank[':END_ID'].fillna("http://purl.obolibrary.org/obo/CHEBI_23888")
DrugBank[':END_ID'] = DrugBank[':END_ID'].str.split(", ")
DrugBank = DrugBank.explode(':END_ID')
DrugBank = DrugBank[['ChEBI ID', ':END_ID']].drop_duplicates().dropna()
DrugBank.head(n=3)

In [None]:
DrugBank['Source'] = 'Entity_linking'
DrugBank[':TYPE'] = 'subclassof'
DrugBank[':START_ID'] = DrugBank['ChEBI ID'].apply(
    lambda val: f"http://purl.obolibrary.org/obo/{val}"
        if val.startswith('CHEBI')
        else f"https://go.drugbank.com/drugs/{val}"
        if val.startswith('DB')
        else val
)
DrugBank = DrugBank.drop(columns=['ChEBI ID'])
DrugBank.head(n=3)

In [None]:
merged_ontology_kg = pd.read_csv(ontology_data_location + 'merged_ontology_kg.txt', sep='\t', names=[':START_ID', ':TYPE', ':END_ID', 'Source'])
merged_ontology_kg[':START_ID'] = merged_ontology_kg[':START_ID'].str.replace("http://identifiers.org/ncbigene/", "http://www.ncbi.nlm.nih.gov/gene/")
merged_ontology_kg[':END_ID'] = merged_ontology_kg[':END_ID'].str.replace("http://identifiers.org/ncbigene/", "http://www.ncbi.nlm.nih.gov/gene/")
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("\'", "", regex=True)
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("]", "").str.replace("[", "")
merged_ontology_kg = pd.concat([merged_ontology_kg, DrugBank])
merged_ontology_kg = merged_ontology_kg.groupby([':START_ID', ':TYPE', ':END_ID']).agg({'Source':set}).reset_index()

obo_graph = nx.DiGraph()
for _, row in tqdm(merged_ontology_kg.iterrows(), desc="Importing edges", total=merged_ontology_kg.shape[0]):
    obo_graph.add_edge(row[':START_ID'], row[':END_ID'], relation=row[':TYPE'])
print(f'There are {obo_graph.number_of_edges()} edges in the ontology.')

merged_ontology_kg.head(n=3)

In [None]:
def get_superclasses(cls, graph):
    stack = [cls]
    superclasses = set()
    visited = set()
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        superclasses.add(str(current))
        stack.extend(e for _, e, data in graph.out_edges(current, data=True) if data["relation"] == "subclassof")
    return superclasses

relation_metadata_dict = {}

# Estrazione delle informazioni dall'ontologia
nodes = set(obo_graph.nodes)

for x in tqdm(nodes, desc="Processing ontology nodes"):
    superclasses = get_superclasses(x, obo_graph)
    relation_metadata_dict[str(x)] = {"Hierarchy": superclasses}

# Creazione del DataFrame
df = pd.DataFrame.from_dict(relation_metadata_dict, orient='index')
df.to_pickle(unprocessed_property_data_location + 'obo_merged_raw.pkl')
df = pd.read_pickle(unprocessed_property_data_location + 'obo_merged_raw.pkl')

# Inizializzazione delle colonne
categories = [
    'Histone_modification', 'Epigenetic_modification', 'Cardiovascular_disease',
    'Neurodegenerative_disease', 'Infectious_disease', 'Autoimmune_disease', 'Biological_role',
    'Inflammatory_disease', 'Drug', 'Cancer', 'GO', 'Viral_protein', 'Human_protein'
]
for category in categories:
    df[category] = ""

# Popolamento delle categorie
mapping = {
    "http://purl.obolibrary.org/obo/SO_0001700": 'Histone_modification',
    "http://purl.obolibrary.org/obo/SO_0001720": 'Epigenetic_modification',
    "http://purl.obolibrary.org/obo/MONDO_0005267": 'Cardiovascular_disease',
    "http://purl.obolibrary.org/obo/MONDO_0005559": 'Neurodegenerative_disease',
    "http://purl.obolibrary.org/obo/MONDO_0005550": 'Infectious_disease',
    "http://purl.obolibrary.org/obo/MONDO_0007179": 'Autoimmune_disease',
    "http://purl.obolibrary.org/obo/MONDO_0045024": 'Cancer',
    "http://purl.obolibrary.org/obo/CHEBI_23888": 'Drug',
    "http://purl.obolibrary.org/obo/CHEBI_24432": 'Biological_role',
    "http://purl.obolibrary.org/obo/PR_000029067": 'Human_protein',
    "http://purl.obolibrary.org/obo/PR_000036197": 'Viral_protein'
}

for uri, category in mapping.items():
    df.loc[df['Hierarchy'].apply(lambda h: uri in h), category.split(':')[0]] += category.split(':')[-1] + ", "

df['Type'] = df[categories].apply(lambda x: ", ".join(filter(None, x)), axis=1)
df['Type'] = df['Type'].str.replace(', , ', ', ')
df['Type'] = df['Type'].str.replace(', $', '', regex=True)
df['Type'] = df['Type'].replace('', np.nan)

df.drop(columns=['Hierarchy'] + categories[:-1], inplace=True)
df[':ID'] = df.index
df['Type'] = df['Type'].apply(lambda x: x.split(", ") if isinstance(x, str) else [])
df['Type'] = df['Type'].apply(lambda items: {i for i in items if pd.notna(i)})
df = df.reset_index(drop=True)
df.to_pickle(unprocessed_property_data_location + 'obo_merged.pkl')
df.head(n=3)

In [None]:
from Bio import Entrez
species = pd.DataFrame(merged_ontology_kg[merged_ontology_kg[':TYPE'] == 'only_in_taxon'][':END_ID'].unique())
species[0] = species[0].str.split("_").str[-1]
species[0] = species[0].str.lstrip("0")

def get_species_name(taxon_id):
    Entrez.email = "emanuele.cavalleri@unimi.it"
    try:
        handle = Entrez.esummary(db="taxonomy", id=taxon_id, retmode="xml")
        record = Entrez.read(handle)
        species_name = record[0]["ScientificName"]
    except Exception as e:
        species_name = None  
        print(f"Errore nel recupero del nome per il taxon ID {taxon_id}: {e}")
    return species_name

species[1] = species[0].apply(lambda x: get_species_name(x))
species.head(n=3)

In [None]:
species_df = merged_ontology_kg[merged_ontology_kg[':TYPE'] == 'only_in_taxon'][[':START_ID',':END_ID']]
species['Species'] = species[1]  
species.drop(columns=[1], inplace=True) 
species[0] = "http://purl.obolibrary.org/obo/NCBITaxon_" + species[0]    
species_df = pd.merge(species_df, species, left_on=':END_ID', right_on=0, how='left').drop(columns=[0])
species_df.head(n=3)     

In [None]:
df.drop(columns=['Human_protein'], inplace=True)
df = pd.merge(df, species_df, left_on=':ID', right_on=':START_ID', how='left').drop(columns=[':START_ID',':END_ID'])
df.to_pickle(unprocessed_property_data_location + 'obo_merged.pkl')
df.head(n=3)

In [None]:
df1 = pd.read_pickle(unprocessed_property_data_location + 'obo_merged.pkl')
df2 = pd.read_pickle(unprocessed_property_data_location + 'obo2.pkl')
df = pd.merge(df1, df2, on=':ID', how='left')
df['Type_x'] = df['Type_x'].apply(lambda x: x if isinstance(x, set) else set())
df['Type_y'] = df['Type_y'].apply(lambda x: x if isinstance(x, set) else set())
df['Type'] = df.apply(lambda row: row['Type_x'].union(row['Type_y']), axis=1).apply(lambda items: {i for i in items})
df.drop(columns=['Type_x', 'Type_y'], inplace=True)
df.to_csv(unprocessed_property_data_location + 'obo.csv', index=None)
df.head(n=3)

We add and adjust nodes types according to the information extracted from the ontology.

In [None]:
df = pd.read_csv(unprocessed_property_data_location + 'obo.csv')

def safe_to_set(x):
    if pd.isna(x) or x.strip() == '' or x.strip() in ['set()', '{}']:
        return set()
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        print(f"Errore nel parsing: {x!r}")
        return set()

df['Synonym'] = df['Synonym'].apply(safe_to_set)
df['Type'] = df['Type'].apply(safe_to_set)

df

In [None]:
print('Number of nodes: ' + str(len(df)))

In [None]:
# Full mapping for all node types in RNA-KG
RNAonly = False # when false all nodes are considered otherwise only RNA nodes are selected

def uri2ntype(uri: str)->Union[str,None]:
    
    retval = None
    
    if not RNAonly:    
        if ("http://purl.obolibrary.org/obo/MONDO" in uri) or ("purl.obolibrary.org/obo/DOID" in uri) or\
            ("ghr.nlm.nih.gov/condition" in uri) or ("rarediseases.info.nih.gov/diseases" in uri):
            retval = "Disease"
        elif ("purl.obolibrary.org/obo/IDO" in uri):
            retval = "Disease, Infectious_disease"
        elif ("purl.obolibrary.org/obo/MFOMD" in uri):
            retval = "Disease, Mental_disease"
        elif ("http://purl.obolibrary.org/obo/GO" in uri):
            retval = "GO"
        elif ("http://purl.obolibrary.org/obo/CHR" in uri):
            retval = "Chromosome"
        elif ("http://purl.obolibrary.org/obo/SO" in uri):
            retval = "Genomic_feature"
        elif ("//purl.obolibrary.org/obo/VO" in uri):
            retval = "Vaccine"
        elif ("http://purl.obolibrary.org/obo/CHEBI" in uri): 
            retval = "Chemical"
        elif ("http://purl.obolibrary.org/obo/PR" in uri) or ("http://purl.obolibrary.org/obo/vo/ontorat/PR" in uri): 
            retval = "Protein"
        elif ("http://purl.obolibrary.org/obo/PW" in uri): 
            retval = "Pathway"
        elif ("http://purl.obolibrary.org/obo/FOODON" in uri): 
            retval = "Food"
        elif ("http://purl.obolibrary.org/obo/MF" in uri): 
            retval = "Mental_functioning"
        elif ("http://purl.obolibrary.org/obo/OGMS" in uri): 
            retval = "General_medical_science"
        elif ("http://purl.obolibrary.org/obo/MAXO" in uri): 
            retval = "Medical_action"
        elif ("http://purl.obolibrary.org/obo/NBO" in uri):
            retval = "Neuro_behaviour"
        elif  ("http://purl.obolibrary.org/obo/CARO" in uri) or ("http://purl.obolibrary.org/obo/UBERON" in uri) or\
            ("http://sig.uw.edu/fma" in uri) or ("http://purl.obolibrary.org/obo/FMA" in uri): 
            retval = "Anatomy"  
        elif  ("http://purl.obolibrary.org/obo/NCIT" in uri): 
            retval = "NCI_thesaurus" 
        elif ("http://purl.obolibrary.org/obo/FBbt" in uri):
            retval = "Anatomy, Drosophila_anatomy"
        elif ("http://purl.obolibrary.org/obo/CL" in uri) or ("http://www.ebi.ac.uk/cellline" in uri): 
            retval = "Cell"
        elif ("http://purl.obolibrary.org/obo/HP" in uri) or ("http://purl.obolibrary.org/obo/PATO" in uri) or\
            ("http://purl.obolibrary.org/obo/UPHENO" in uri): 
            retval = "Phenotype"
        elif ("http://purl.obolibrary.org/obo/GNO" in uri): 
            retval = "Glycan"
        elif ("http://purl.obolibrary.org/obo/BFO" in uri): 
            retval = "Basic_formal"
        elif ("http://purl.obolibrary.org/obo/ENVO" in uri): 
            retval = "Environment"
        elif ("http://purl.obolibrary.org/obo/ECTO" in uri): 
            retval = "Environmental_exposure"
        elif ("http://purl.obolibrary.org/obo/PO" in uri): 
            retval = "Plant"
        elif ("http://purl.obolibrary.org/obo/FAO" in uri): 
            retval = "Anatomy, Fungal_anatomy"
        elif ("http://purl.obolibrary.org/obo/MOD" in uri): 
            retval = "Protein_modification"
        elif ("http://purl.obolibrary.org/obo/OPL" in uri): 
            retval = "Parasite_lifecycle"
        elif ("http://purl.obolibrary.org/obo/MPATH" in uri): 
            retval = "Disease, Mouse_pathology"
        elif ("http://purl.obolibrary.org/obo/OBA" in uri): 
            retval = "Trait"
        elif ("http://purl.obolibrary.org/obo/DDANAT" in uri): 
            retval = "Anatomy, Dictyostelium_discoideum_anatomy"
        elif ("http://purl.obolibrary.org/obo/OAE" in uri): 
            retval = "Adverse_events"
        elif ("http://purl.obolibrary.org/obo/BTO" in uri): 
            retval = "Anatomy, Tissue"
        elif ("www.ncbi.nlm.nih.gov/gene" in uri) or ("http://purl.obolibrary.org/obo/OGG" in uri) or\
            ("http://birdgenenames.org/cgnc/GeneReport?id=" in uri) or ("http://www.genenames.org/cgi-bin/gene_symbol_report?hgnc_id=" in uri):
            retval = "Gene"
        elif ("http://purl.obolibrary.org/obo/OGG" in uri):
            retval = "Genome"
        elif ("purl.obolibrary.org/obo/NCBITaxon" in uri): 
            retval = "Species"
        elif ("purl.obolibrary.org/obo/BSPO" in uri): 
            retval = "Spatial_concept"
        elif ("purl.obolibrary.org/obo/OPMI" in uri): 
            retval = "Precision_medicine"
        elif ("bigdata.ibp.ac.cn/SmProt/SmProt.php?ID" in uri): 
            retval = "Protein, Small_protein"
        elif ("snomedct" in uri) or ("SNOMEDCT" in uri): 
            retval = "Snomed_thesaurus"
        elif ("http://www.ebi.ac.uk/efo/EFO" in uri): 
            retval = "Experimental_factor"
        elif ("https://go.drugbank.com/drugs/" in uri): 
            retval = "Drug"
        elif ("http://purl.obolibrary.org/obo/HsapDv" in uri): 
            retval = "Human_developmental_stage"
        elif ("http://www.w3.org/2002/07/owl#Nothing" in uri): 
            retval = "owlNothing"
            
    else:
        retval = np.nan

    return retval

In [None]:
%%time
ntypes_list = []
for u in tqdm(df[":ID"].values):
    nty = uri2ntype(u)
    ntypes_list.append(nty)

df.loc[:,":TYPE"] = ntypes_list
df.tail()

In [None]:
print(df[':TYPE'].unique())
df.head(n=3)

In [None]:
print("Unassigned node types:")
print(df[df[':TYPE'].isna()]) # Must be empty

In [None]:
df[':TYPE'] = df[':TYPE'].str.split(", ").apply(lambda items: {i for i in items})
df[':TYPE'] = df.apply(lambda row: row[':TYPE'].union(row['Type']), axis=1).apply(lambda items: [i for i in items]).apply(json.dumps)
df.drop(columns=['Type'], inplace=True)
df.head(n=3)

Finally, we save nodes.csv.

In [None]:
db_entities_in_KG = pd.read_pickle(unprocessed_property_data_location + 'db_entities_in_KG.pkl')
db_entities_in_KG[0] = db_entities_in_KG[0].astype(str)
db_entities = pd.read_pickle(unprocessed_property_data_location + 'db_entities.pkl')
db_entities['KG_ID'] = db_entities['KG_ID'].astype(str)
#db_entities = db_entities[db_entities['KG_ID'].isin(db_entities_in_KG[0])] # If you want to filter the entities according to the KG

obo = df.copy()
obo['Synonym'] = obo['Synonym'].apply(lambda x: x if isinstance(x, set) else set())
obo['Synonym'] = obo['Synonym'].apply(lambda items: [i for i in items]).apply(json.dumps)
obo['KG_ID'] = obo[':ID'].str.replace("http://purl.obolibrary.org/obo/", "").str.replace("https://go.drugbank.com/drugs/", "")
obo['OBO_ID'] = np.where(
    obo[':ID'].str.startswith("http://purl.obolibrary.org/obo/"),
    obo[':ID'].str.replace("http://purl.obolibrary.org/obo/", "").str.replace("_", ":"),
    np.nan
)
obo['DrugBank_ID'] = np.where(
    obo[':ID'].str.startswith("https://go.drugbank.com/drugs/"),
    obo[':ID'].str.replace("https://go.drugbank.com/drugs/", ""),
    np.nan
)

obo['DrugBank_ID'] = "DrugBank:" + obo['DrugBank_ID']
obo['ID'] = obo['OBO_ID'].fillna(obo['DrugBank_ID'])
obo = obo.drop(columns=['OBO_ID','DrugBank_ID'])

nodes = pd.concat([db_entities, obo], ignore_index=True)
nodes.rename(columns={':ID':'URI:ID'}, inplace=True)
nodes.drop_duplicates(subset=['URI:ID'], keep='first', inplace=True)
nodes.to_csv(processed_data_location + 'nodes.csv', index=None)
nodes.head(n=3)

For each edge file (.pkl) in unprocessed_edge_data_location, keep only the edges for which the ends are contained in nodes and map identifiers.

In [None]:
# Show all names of edges' properties
sett = set()
for filename in os.listdir(unprocessed_edge_data_location):
    if filename.endswith('.pkl'):
        print(pd.read_pickle(unprocessed_edge_data_location + filename).columns)
        for i in pd.read_pickle(unprocessed_edge_data_location + filename).columns:
            sett.add(i)
sett

'''
sett = {':END_ID', ':START_ID', ':TYPE',  'Abundance',  'Binding_pos', 'Distance', 'Drug', 'Exon', 'FDR', 'FPKM', 'Fold_Change', 'GO_evidence',
         'GeneMANIA_weight', 'Interactor', 'Knockdown_percentage', 'Location', 'Maximum_RPM', 'Method', 'Minimum_free_energy_kcal_mol', 'Mutation',
         'Number_of_oligos', 'Position', 'PubMedID', 'RCI', 'RNAsister_score', 'Regulator', 'Rfam_score', 'Source', 'TANRIC_score', 'TPM', 'TYPE',
         'Weighted_CS_score', 'log2FC', 'miRDB_score', 'miTG_score', 'microT_score', 'p-value', 'zScore'}
'''

In [None]:
nodes = pd.read_csv(processed_data_location + 'nodes.csv')
nodes_map = nodes[['KG_ID','URI:ID']].rename(columns={'URI:ID':':ID'})

for filename in os.listdir(unprocessed_edge_data_location):
    if filename.endswith('.pkl'):
        print("Processing:", filename)
        df = pd.read_pickle(unprocessed_edge_data_location + filename)
        df[':START_ID'] = df[':START_ID'].astype(str)
        df[':END_ID'] = df[':END_ID'].astype(str)
        df = df.drop_duplicates(subset=[':START_ID', ':END_ID'], keep='first')
        df = df.merge(nodes_map, left_on=':START_ID', right_on='KG_ID', how='inner').drop(
            columns=[':START_ID','KG_ID']).rename(columns={':ID':':START_ID'})
        df = df.merge(nodes_map, left_on=':END_ID', right_on='KG_ID', how='inner').drop(
            columns=[':END_ID','KG_ID']).rename(columns={':ID':':END_ID'})
        df['Source'] = df['Source'].apply(lambda items: [i for i in items]).apply(json.dumps)
        cols = df.columns
        if 'TYPE' in cols:
            df.rename(columns={'TYPE':':TYPE'}, inplace=True)
        if ':TYPE' not in cols:
            df[':TYPE'] = filename.split('_', 1)[-1].rsplit('_', 1)[0]
        if 'Method' in cols:
            df['Method'] = df['Method'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                              ).apply(json.dumps)
        if 'Location' in cols:
            df['Location'] = df['Location'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                                  ).apply(json.dumps)
            df.rename(columns={'Location':'Context'}, inplace=True)
        if 'GO_evidence' in cols:
            df['GO_evidence'] = df['GO_evidence'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                                        ).apply(json.dumps)
        if 'PubMedID' in cols:
            df['PubMedID'] = df['PubMedID'].apply(lambda items: [str(i) for i in items if str(i) != 'nan'
                                                                 and pd.notna(i)]).apply(json.dumps)
        if 'Mutation' in cols:
            df['Mutation'] = df['Mutation'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                                  ).apply(json.dumps)
        if 'Interactor' in cols:
            df['Interactor'] = df['Interactor'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                                      ).apply(json.dumps)
        if 'Regulator' in cols:
            df['Regulator'] = df['Regulator'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                                    ).apply(json.dumps)
        if 'Exon' in cols:
            df['Exon'] = df['Exon'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]).apply(json.dumps)
        if 'Drug' in cols:
            df['Drug'] = df['Drug'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]).apply(json.dumps)
        if 'Knockdown_percentage' in cols:
            df['Knockdown_percentage'] = df['Knockdown_percentage'].apply(lambda items: [i for i in items if i != 'nan' and
                                                                                         pd.notna(i)]).apply(json.dumps)
        if 'Binding_pos' in cols:
            df['Binding_pos'] = df['Binding_pos'].apply(lambda items: [i for i in items if i != 'nan' and pd.notna(i)]
                                                        ).apply(json.dumps)
        df.to_csv(unprocessed_edge_data_location + filename.replace('.pkl', '') + ".csv", index=None)

***
***
# Entity Linking
Non-ontological entities must be linked to proper classes using the RDF's subClassOf relationship.

* Genes

In [None]:
genes = pd.read_csv(processed_data_location + 'SO_GENE_TRANSCRIPT_VARIANT_TYPE_MAPPING.txt', sep='\t', header=None)
genes = genes[genes[0].astype(str).str[0].str.isdigit()]
genes['Source'] = 'Entity_linking'
genes[':START_ID'] = "http://www.ncbi.nlm.nih.gov/gene/" + genes[0].astype(str)
genes[':END_ID'] = genes[1].str.replace("SO_", 'http://purl.obolibrary.org/obo/SO_')
genes[':TYPE'] = 'subclassof'
genes = genes[[':START_ID', ':END_ID', ':TYPE', 'Source']]
genes.head(n=3)

* RNAcentral + Ensembl

In [None]:
rnacentral_el = pd.read_pickle(unprocessed_property_data_location + 'rnacentral_el.pkl')
rnacentral_el[':START_ID'] = "https://rnacentral.org/rna/" + rnacentral_el['Name'].astype(str) + '_9606'
rnacentral_el[':TYPE'] = 'subclassof'
rnacentral_el['Source'] = 'Entity_linking'
rnacentral_el = rnacentral_el[[':START_ID', 'Name', 'Category', ':TYPE', 'Source']]
rnacentral_el.head(n=3)

In [None]:
rna = pd.read_csv(processed_data_location + 'SO_GENE_TRANSCRIPT_VARIANT_TYPE_MAPPING.txt', sep='\t', header=None)
rna = rna[rna[0].str.startswith('ENST')]
rna.head(n=3)

In [None]:
rnacentral_map_ensembl = pd.read_csv(
    processed_data_location + 'RNAcentral_MAP/ensembl.tsv',
    sep='\t', names=['RNAcentral ID', 'DB', 'Ensembl transcript ID', 'Organism', 'RNA category', 'Ensembl Gene ID'])
rnacentral_map_human_ensembl = rnacentral_map_ensembl[rnacentral_map_ensembl['Organism'] == 9606].drop(
    columns=['Organism', 'DB', 'RNA category', 'Ensembl Gene ID'])
rnacentral_map_human_ensembl.head(n=3)

In [None]:
rna = pd.merge(rna, rnacentral_map_human_ensembl, left_on=0, right_on='Ensembl transcript ID', how='left')
rna['RNAcentral ID'] = rna['RNAcentral ID'].fillna(rna[0])
rna = rna[['RNAcentral ID', 1]]
rna.head(n=3)

In [None]:
rnacentral_el = rnacentral_el[~rnacentral_el['Name'].isin(rna['RNAcentral ID'])]
rnacentral_el.head(n=3)

In [None]:
rna_mapping_data = pd.read_excel(open(unprocessed_data_location + 'genomic_sequence_ontology_mappings.xlsx', 'rb'),
                                 sheet_name='GenomicType_SO_Map_09Mar2020', header=0, engine='openpyxl')
rna_mapping_data = rna_mapping_data[rna_mapping_data['Genomic'] == 'Transcript']
rna_mapping_data = rna_mapping_data[['Term', 'SO ID']]
rna_mapping_data = rna_mapping_data.sort_values(by='Term', key=lambda col: col.str.lower())

new_rows = pd.DataFrame([
    {'Term': 'RNase_P_RNA', 'SO ID': 'SO_0000386'},
    {'Term': 'SRP_RNA', 'SO ID': 'SO_0000590'},
    {'Term': 'Y_RNA', 'SO ID': 'SO_0000405'},
    {'Term': 'hammerhead_ribozyme', 'SO ID': 'SO_0000380, SO_0000374'},
    {'Term': 'ncRNA', 'SO ID': 'SO_0000655'},
    {'Term': 'pre_miRNA', 'SO ID': 'SO_0001244, SO_0000276'},
    {'Term': 'tRNA', 'SO ID': 'SO_0000253'},
    {'Term': 'telomerase_RNA', 'SO ID': 'SO_0000390'},
    {'Term': 'piRNA', 'SO ID': 'SO_0001035'},
    {'Term': 'antisense_RNA', 'SO ID': 'SO_0000644'},
    {'Term': 'precursor_RNA', 'SO ID': 'SO_0000835'},
    {'Term': 'guide_RNA', 'SO ID': 'SO_0000602'},
    {'Term': 'autocatalytically_spliced_intron', 'SO ID': 'SO_0000588'},
    {'Term': 'RNase_MRP_RNA', 'SO ID': 'SO_0000385'},
    {'Term': 'tmRNA', 'SO ID': 'SO_0000584'},
    {'Term': 'other', 'SO ID': 'SO_0000655'},
    {'Term': 'circRNA', 'SO ID': 'SO_0002291'},
    {'Term': 'vault_RNA', 'SO ID': 'SO_0000404'},
])

rna_mapping_data = pd.concat([rna_mapping_data, new_rows], ignore_index=True)
rna_mapping_data['SO ID'] = rna_mapping_data['SO ID'].str.split(", ")
rna_mapping_data = rna_mapping_data.explode('SO ID')
rna_mapping_data.head(n=3)

In [None]:
rnacentral_el = pd.merge(rna_mapping_data, rnacentral_el, left_on='Term', right_on='Category', how='right')
rnacentral_el.head(n=3)

In [None]:
rnacentral_el[rnacentral_el['SO ID'].isnull()]['Category'].unique() # must be empty

In [None]:
rnacentral_el = rnacentral_el[['Name', 'SO ID']].rename(columns={'Name': 'RNAcentral ID', 'SO ID':1})
rnacentral_el = pd.concat([rna, rnacentral_el]).drop_duplicates()
rnacentral_el['Source'] = 'Entity_linking'
rnacentral_el = rnacentral_el.rename(columns={'RNAcentral ID':':START_ID', 1:':END_ID'})
rnacentral_el[':START_ID'] = rnacentral_el[':START_ID'].apply(
    lambda val: f"https://www.ensembl.org/Homo_sapiens/Transcript/Summary?t={val}"
        if val.startswith('ENST')
        else f"https://rnacentral.org/rna/{val}_9606"
        if val.startswith('URS')
        else val
)
rnacentral_el[':TYPE'] = 'subclassof'
rnacentral_el[':END_ID'] = "http://purl.obolibrary.org/obo/" + rnacentral_el[':END_ID'].astype(str)
rnacentral_el.head(n=3)

* Addgene

In [None]:
gRNA_gene = pd.read_csv(unprocessed_data_location + 'grna_sequences_addgene.txt', sep='\t', dtype = {"Plasmid ID":str})  
gRNA_gene = gRNA_gene[['Plasmid ID']].drop_duplicates().dropna()
gRNA_gene[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000602'
gRNA_gene['Source'] = 'Entity_linking'
gRNA_gene[':TYPE'] = 'subclassof'
gRNA_gene[':START_ID'] = "https://www.addgene.org/" + gRNA_gene['Plasmid ID']
gRNA_gene = gRNA_gene.drop(columns=['Plasmid ID'])
gRNA_gene.head(n=3)

* [The MIT/ICBP siRNA Database](http://web.mit.edu/sirna/index.html) <br /> The MIT/ICBP siRNA Database has validated siRNA and shRNA sequences against over 100 genes.

In [None]:
ICBP = pd.read_html('http://web.mit.edu/sirna/sirnas-gene.html')
ICBP = ICBP[1]
ICBP.columns = ICBP.iloc[[0]].squeeze()
ICBP.drop(0, inplace=True)
ICBP[['ID#']] = ICBP[['ID#']] + '.html'
ICBPsiRNA = ICBP.loc[(ICBP['siRNA'] == 'x') & (ICBP['Human'] == 'x')]
ICBPsiRNA = ICBPsiRNA[['ID#']].drop_duplicates().dropna()
ICBPsiRNA[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000646'
ICBPsiRNA['Source'] = 'Entity_linking'
ICBPsiRNA[':TYPE'] = 'subclassof'

ICBPshRNA = ICBP.loc[(ICBP['shRNA'] == 'x') & (ICBP['Human'] == 'x')] # shRNA
ICBPshRNA = ICBPshRNA[['ID#']].drop_duplicates().dropna()
ICBPshRNA[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0002031'
ICBPshRNA['Source'] = 'Entity_linking'
ICBPshRNA[':TYPE'] = 'subclassof'

ICBP = pd.concat([ICBPsiRNA, ICBPshRNA])
ICBP[':START_ID'] = "http://web.mit.edu/sirna/sequences/results-" + ICBP['ID#']
ICBP = ICBP.drop(columns=['ID#'])
ICBP.head(n=3)

* circBase

In [None]:
circbase = pd.read_csv(unprocessed_data_location + 'hsa_hg19_circRNA.txt', sep='\t')
circbase = circbase[['circRNA ID']].drop_duplicates().dropna()
circbase[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0002291'
circbase['Source'] = 'Entity_linking'
circbase[':TYPE'] = 'subclassof'
circbase[':START_ID'] = "http://circbase.org/cgi-bin/singlerecord.cgi?id=" + circbase['circRNA ID']
circbase = circbase.drop(columns=['circRNA ID'])
circbase.head(n=3)

In [None]:
# Translated ncRNA --> circular_mRNA
RNA_anatomy = pd.read_excel(unprocessed_data_location + 'Translated ncRNA.xlsx')

RNA_anatomy = RNA_anatomy[RNA_anatomy.Notes != 'It has been re-annotated as protein coding gene now']
RNA_anatomy = RNA_anatomy[RNA_anatomy['Gene.ID'].notna()]
circbase2 = pd.DataFrame(RNA_anatomy[(RNA_anatomy['Type']=='circRNA') & (RNA_anatomy['Name'].str.startswith('hsa_circ_'))]['Name'].unique())
circbase2[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0002292'
circbase2['Source'] = 'Entity_linking'
circbase2[':TYPE'] = 'subclassof'
circbase2[':START_ID'] = "http://circbase.org/cgi-bin/singlerecord.cgi?id=" + circbase2[0]
circbase2 = circbase2.drop(columns=[0])
circbase2.head(n=3)

* [eSkip-Finder](https://eskip-finder.org/cgi-bin/input.cgi) <br /> eSkip-Finder is the first machine learning-based design tool and database of antisense oligonucleotides (ASOs) for exon skipping. A significant challenge, however, is the difficulty in selecting an optimal target sequence for exon skipping.

In [None]:
ASO_mRNA = pd.read_html(unprocessed_data_location + 'eSkip-Finder.html')[2]
ASO_mRNA = ASO_mRNA[ASO_mRNA['Species'] == 'human']
ASO_mRNA = ASO_mRNA[ASO_mRNA['Oligo name in literature'] != 'Null']
ASO_mRNA = ASO_mRNA[['Oligo name in literature']]
ASO_mRNA[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000644, http://purl.obolibrary.org/obo/SO_0001247'
ASO_mRNA['Source'] = 'Entity_linking'
ASO_mRNA[':TYPE'] = 'subclassof'
ASO_mRNA[':END_ID'] = ASO_mRNA[':END_ID'].str.split(", ")
ASO_mRNA = ASO_mRNA.explode(':END_ID')
ASO_mRNA[':START_ID'] = 'https://eskip-finder.org/cgi-bin/input.cgi?' + ASO_mRNA['Oligo name in literature'].str.replace(r'\s+', '_', regex=True)
ASO_mRNA = ASO_mRNA.drop(columns=['Oligo name in literature'])
ASO_mRNA.head(n=3)

* tsRFun

In [None]:
tsRNA = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")[['tsRNAid']].drop_duplicates().dropna()
tsRNA[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0001172'
tsRNA['Source'] = 'Entity_linking'
tsRNA[':TYPE'] = 'subclassof'
tsRNA[':START_ID'] = 'http://biomed.nscc-gz.cn/DB/tsRFun/searchDetail-tsRNA.php?tsRNAid=' + tsRNA['tsRNAid']
tsRNA = tsRNA.drop(columns=['tsRNAid'])
tsRNA.head(n=3)

* tRFdb

In [None]:
#http://genome.bioch.virginia.edu/trfdb/index.php
tRF1_tRNA = pd.read_html(unprocessed_data_location+'trf1.html')[2]
tRF1_tRNA.drop(columns=['Organism'],inplace=True)
tRF1_tRNA.head()

tRF3_tRNA = pd.read_html(unprocessed_data_location+'trf3.html')[2]
tRF3_tRNA.drop(columns=['Organism'],inplace=True)

tRF5_tRNA = pd.read_html(unprocessed_data_location+'trf5.html')[2]
tRF5_tRNA.drop(columns=['Organism'],inplace=True)

tRF_tRNA = pd.concat([tRF1_tRNA,tRF3_tRNA,tRF5_tRNA])
tRF_tRNA = tRF_tRNA.drop(columns=['Experiment Info', 'Sequence'])
tRF_tRNA['tRF ID'] = "trfdb?" + tRF_tRNA['tRF ID'].astype(str)
tRF_tRNA = tRF_tRNA[['tRF ID']].drop_duplicates().dropna()
tRF_tRNA[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0001172'
tRF_tRNA['Source'] = 'Entity_linking'
tRF_tRNA[':TYPE'] = 'subclassof'
tRF_tRNA[':START_ID'] = 'http://genome.bioch.virginia.edu/trfdb/experiments_display.php?' + tRF_tRNA['tRF ID']
tRF_tRNA = tRF_tRNA.drop(columns=['tRF ID'])
tRF_tRNA.head(n=3)

* MINTBASE

In [None]:
# https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbase.txt',sep='\t')[['License Plate (sequence derived)']].drop_duplicates().dropna()
tRF_tRNA2[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0001172'
tRF_tRNA2['Source'] = 'Entity_linking'
tRF_tRNA2[':TYPE'] = 'subclassof'
tRF_tRNA2[':START_ID'] = 'https://cm.jefferson.edu/MINTbase/InputController?v=g&g=GRCh37&fn=' + tRF_tRNA2['License Plate (sequence derived)']
tRF_tRNA2 = tRF_tRNA2.drop(columns=['License Plate (sequence derived)'])
tRF_tRNA2.head(n=3)

* TBDB

In [None]:
riboswitch_protein = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',')[['accession_url']].drop_duplicates().dropna()
riboswitch_protein[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000035'
riboswitch_protein['Source'] = 'Entity_linking'
riboswitch_protein[':TYPE'] = 'subclassof'
riboswitch_protein = riboswitch_protein.rename(columns={'accession_url':':START_ID'})
riboswitch_protein.head(n=3)

* RSwitch

In [None]:
riboswitch_bactStrain = pd.read_csv(unprocessed_data_location + 'rswitch.csv', header=None)[[0]].drop_duplicates().dropna()
riboswitch_bactStrain[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000035'
riboswitch_bactStrain['Source'] = 'Entity_linking'
riboswitch_bactStrain[':TYPE'] = 'subclassof'
riboswitch_bactStrain[':START_ID'] = 'https://penchovsky.atwebpages.com/applications.php?page=58?' + riboswitch_bactStrain[0]
riboswitch_bactStrain = riboswitch_bactStrain.drop(columns=[0])
riboswitch_bactStrain.head(n=3)

* ViroidDB

In [None]:
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T 

# Extract ribozymes 
myre = re.compile(r"\n>> .*?\n")
ribozyme = [myre.findall(i) for i in vRNA_ribozyme.ribozymes]
ribozyme = [[j.replace("\n",'').replace(">> ",'') for j in i] for i in ribozyme]

vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']), # Genome --> NCBI nuccore 
                           pd.Series(ribozyme)], axis=1)
vRNA_ribozyme = vRNA_ribozyme.explode(0)
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
vRNA_ribozyme['accession'] = vRNA_ribozyme['accession'].str.split(".").str[0]
vRNA_ribozyme = vRNA_ribozyme[vRNA_ribozyme.species == 'Hepatitis delta virus'][['accession']].drop_duplicates().dropna()
vRNA_ribozyme[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0001200, http://purl.obolibrary.org/obo/SO_0001041'
# they are all negative_sense_ssRNA_viral_sequence
vRNA_ribozyme['Source'] = 'Entity_linking'
vRNA_ribozyme[':TYPE'] = 'subclassof'
vRNA_ribozyme[':END_ID'] = vRNA_ribozyme[':END_ID'].str.split(", ")
vRNA_ribozyme = vRNA_ribozyme.explode(':END_ID')
vRNA_ribozyme[':START_ID'] = 'https://www.ncbi.nlm.nih.gov/nuccore/' + vRNA_ribozyme['accession']
vRNA_ribozyme = vRNA_ribozyme.drop(columns=['accession'])
vRNA_ribozyme.head(n=3)

* Apta-Index

In [None]:
aptamer_protein = pd.read_csv(unprocessed_data_location + 'aptaindex.csv',names=['Label', 'ID', 'Target', 'Sequence'],skiprows=[0]) 
aptamer_protein['ID'] = 'aptamer-details/?id=' + aptamer_protein['ID'].astype(str)
aptamer_protein = aptamer_protein[['ID']].drop_duplicates().dropna()
aptamer_protein[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000033'
aptamer_protein['Source'] = 'Entity_linking'
aptamer_protein[':TYPE'] = 'subclassof'
aptamer_protein[':START_ID'] = 'https://www.aptagen.com/' + aptamer_protein['ID']
aptamer_protein = aptamer_protein.drop(columns=['ID'])
aptamer_protein.head(n=3)

* COSMIC

In [None]:
cosmic = pd.read_pickle(unprocessed_property_data_location + 'cosmic.pkl')[[':ID',':TYPE']]
cosmic[':TYPE'] = cosmic[':TYPE'].str.replace('''["Variant", "Somatic_variant", "SNV"]''',"SNV")
cosmic[':TYPE'] = cosmic[':TYPE'].str.replace('''["Variant", "Somatic_variant", "Deletion"]''',"deletion")
cosmic[':TYPE'] = cosmic[':TYPE'].str.replace('''["Variant", "Somatic_variant", "Insertion"]''',"insertion")
cosmic[':TYPE'] = cosmic[':TYPE'].str.replace('''["Variant", "Somatic_variant", "Indel"]''',"indel")
cosmic[':TYPE'] = cosmic[':TYPE'].str.replace('''["substitution"]''',"substitution")
cosmic['SO_TERM'] = cosmic[':TYPE']

cosmic.SO_TERM = cosmic.SO_TERM.replace('SNV', 'single nucleotide variant')
print(cosmic.SO_TERM.unique())

variant_mapping_data = pd.read_excel(open(unprocessed_data_location + 'genomic_sequence_ontology_mappings.xlsx', 'rb'),
                                 sheet_name='GenomicType_SO_Map_09Mar2020', header=0, engine='openpyxl')
variant_mapping_data = variant_mapping_data[variant_mapping_data['Genomic'] == 'Variant']
variant_mapping_data = variant_mapping_data[['Term', 'SO ID']]

print(variant_mapping_data['Term'].unique())

cosmic = pd.merge(variant_mapping_data, cosmic, left_on='Term', right_on='SO_TERM', how='right')
cosmic2 = cosmic.copy()[[':ID']].drop_duplicates()
cosmic2['SO ID'] = 'SO_0001777'
cosmic = pd.concat([cosmic, cosmic2]).drop(columns=['Term', 'SO_TERM'])
cosmic[':TYPE'] = 'subclassof'
cosmic['Source'] = 'Entity_linking'
cosmic[':END_ID'] = 'http://purl.obolibrary.org/obo/' + cosmic['SO ID']
cosmic[':START_ID'] = cosmic[':ID']
cosmic = cosmic.drop(columns=['SO ID', ':ID'])
cosmic.head(n=3)

* SNP

In [None]:
variant = pd.read_csv(processed_data_location + 'SO_GENE_TRANSCRIPT_VARIANT_TYPE_MAPPING.txt', sep='\t', header=None)
variant = variant[variant[0].str.startswith('rs')]
variant['Source'] = 'Entity_linking'
variant.rename(columns={0:':START_ID', 1:':END_ID'}, inplace=True)
variant[':TYPE'] = 'subclassof'
variant[':START_ID'] = "https://www.ncbi.nlm.nih.gov/snp/" + variant[':START_ID']
variant[':END_ID'] = "http://purl.obolibrary.org/obo/" + variant[':END_ID']
variant.head(n=3)

* Reactome

In [None]:
reactome = pd.read_csv(processed_data_location + 'REACTOME_PW_GO_MAPPINGS.txt', sep='\t', header=None)
reactome['Source'] = 'Entity_linking'
reactome[':TYPE'] = 'subclassof'
reactome.rename(columns={0:':START_ID', 1:':END_ID'}, inplace=True)
reactome[':START_ID'] = "https://reactome.org/content/detail/" + reactome[':START_ID']
reactome[':END_ID'] = "http://purl.obolibrary.org/obo/" + reactome[':END_ID']
reactome.head(n=3)

* Wikipathways

In [None]:
wpwnonO_data = pd.read_csv('../resources/processed_data/DESC_WIKIPATHWAYS_MAP.txt', header=None, sep='\t')[1]

wpwnonO_data = pd.DataFrame(wpwnonO_data)
wpwnonO_data[':END_ID'] = 'http://purl.obolibrary.org/obo/PW_0000001'
wpwnonO_data['Source'] = 'Entity_linking'
wpwnonO_data[':TYPE'] = 'subclassof'
wpwnonO_data.rename(columns={1:':START_ID'}, inplace=True)
wpwnonO_data[':START_ID'] = "https://www.wikipathways.org/instance/" + wpwnonO_data[':START_ID']
wpwnonO_data.head(n=3)

* Biological roles

In [None]:
name = ['https://www.genome.gov/genetics-glossary/Tumor-Suppressor-Gene', 'https://www.genome.gov/genetics-glossary/Oncogene',
        'https://www.genome.gov/genetics-glossary/General']
role = pd.DataFrame({'Name': name})
role[':END_ID'] = 'http://purl.obolibrary.org/obo/CHEBI_24432'
role['Source'] = 'Entity_linking'
role[':TYPE'] = 'subclassof'
role.rename(columns={'Name':':START_ID'}, inplace=True)
role

* Small proteins

In [None]:
lncRNA_protein = pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz') 
lncRNA_protein['SmProt ID'] = 'http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=' + lncRNA_protein['SmProt ID']
lncRNA_protein['SmProt Protein Sequence'] = lncRNA_protein['SmProt Protein Sequence'].str.replace('*', '', regex=False)
lncRNA_protein = lncRNA_protein[['SmProt ID']].drop_duplicates().dropna()

RNA_anatomy = pd.read_excel(unprocessed_data_location + 'Translated ncRNA.xlsx').rename(columns={'cncRNAdb.ID':'SmProt ID'})
RNA_anatomy['SmProt ID'] = "https://www.rna-society.org/cncrnadb?" + RNA_anatomy['SmProt ID']
RNA_anatomy = RNA_anatomy[RNA_anatomy.Organism.str.contains('apiens')]
RNA_anatomy = RNA_anatomy[RNA_anatomy.Notes != 'It has been re-annotated as protein coding gene now']
RNA_anatomy = RNA_anatomy[['SmProt ID']].drop_duplicates().dropna()

lncRNA_protein = pd.concat([lncRNA_protein, RNA_anatomy])
lncRNA_protein[':END_ID'] = 'http://purl.obolibrary.org/obo/SO_0000104, http://purl.obolibrary.org/obo/PR_000018263'
lncRNA_protein[':END_ID'] = lncRNA_protein[':END_ID'].str.split(", ")
lncRNA_protein = lncRNA_protein.explode(':END_ID')
lncRNA_protein['Source'] = 'Entity_linking'
lncRNA_protein[':TYPE'] = 'subclassof'
lncRNA_protein.rename(columns={'SmProt ID':':START_ID'}, inplace=True)
lncRNA_protein.head(n=3)

* Chemical modifications

In [None]:
tRNA_mod = pd.read_csv(edge_data_location+'modification-tRNA2314.txt', sep='\t')[['Modification']].drop_duplicates().dropna()
tRNA_mod[':END_ID'] = 'http://purl.obolibrary.org/obo/GO_0009451, http://purl.obolibrary.org/obo/SO_0001720'
tRNA_mod[':END_ID'] = tRNA_mod[':END_ID'].str.split(", ")
tRNA_mod = tRNA_mod.explode(':END_ID')
tRNA_mod['Source'] = 'Entity_linking'
tRNA_mod[':TYPE'] = 'subclassof'
tRNA_mod[':START_ID'] = 'https://genesilico.pl/modomics?' + tRNA_mod['Modification']
tRNA_mod = tRNA_mod.drop(columns=['Modification'])
tRNA_mod.head(n=3)

In [None]:
entity_linking = pd.concat([genes, rnacentral_el, gRNA_gene, ICBP, circbase, circbase2, ASO_mRNA, tsRNA, tRF_tRNA,
                            tRF_tRNA2, riboswitch_protein, riboswitch_bactStrain, vRNA_ribozyme, aptamer_protein,
                            cosmic, variant, reactome, wpwnonO_data, role, lncRNA_protein, tRNA_mod]).drop_duplicates()
entity_linking = entity_linking[(entity_linking[':START_ID'].isin(nodes['URI:ID'])) & (entity_linking[':END_ID'].isin(nodes['URI:ID']))]
entity_linking['Source'] = entity_linking['Source'].apply(lambda x: [x]).apply(json.dumps)
entity_linking.to_csv(unprocessed_edge_data_location + 'entity_linking.csv', index=False)
entity_linking.head(n=3)

We outer join each CSV in unprocessed_edges with merged_ontology_kg to check for repeated relationships. We do that by excluding relationships we are sure can not occur in the merged_ontology_kg such as ones involving the db entities representing RNAs or COSMIC mutations.

In [None]:
type = set()
for filename in os.listdir(unprocessed_edge_data_location):
    if filename.endswith('.csv') and "RNA" not in filename and "COSMIC" not in filename and "smallProtein" not in filename and "genome" not in filename:
        type.add(filename)
list(type)[:3]

In [None]:
file = []
for filename in tqdm(os.listdir(unprocessed_edge_data_location)):
    if filename in list(type):
        print("Processing:", filename)
        lod = pd.read_csv(unprocessed_edge_data_location + filename)
        file.append(lod)
lod = pd.concat(file, ignore_index=True)
lod['Source'] = lod['Source'].str.replace('\"', '').str.replace('[', '').str.replace(']', '').str.split(', ')
lod['Source'] = lod['Source'].apply(lambda x: set(x))
lod.head(n=3)

In [None]:
merged_ontology_kg = pd.read_csv(ontology_data_location + 'merged_ontology_kg.txt', sep='\t', names=[':START_ID', ':TYPE', ':END_ID', 'Source'])
merged_ontology_kg[':START_ID'] = merged_ontology_kg[':START_ID'].str.replace("http://identifiers.org/ncbigene/", "http://www.ncbi.nlm.nih.gov/gene/")
merged_ontology_kg[':END_ID'] = merged_ontology_kg[':END_ID'].str.replace("http://identifiers.org/ncbigene/", "http://www.ncbi.nlm.nih.gov/gene/")
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("\'", "", regex=True)
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("]", "").str.replace("[", "")
merged_ontology_kg = pd.concat([merged_ontology_kg, DrugBank])
merged_ontology_kg = merged_ontology_kg.groupby([':START_ID', ':TYPE', ':END_ID']).agg({'Source':set}).reset_index()
merged_ontology_kg.head(n=3)

In [None]:
merged_ontology_and_lod_kg = pd.merge(merged_ontology_kg, lod, on=[':START_ID', ':TYPE', ':END_ID'], how='outer')
merged_ontology_and_lod_kg['Source_x'] = merged_ontology_and_lod_kg['Source_x'].apply(lambda x: set() if pd.isna(x) else x)
merged_ontology_and_lod_kg['Source_y'] = merged_ontology_and_lod_kg['Source_y'].apply(lambda x: set() if pd.isna(x) else x)
merged_ontology_and_lod_kg['Source'] = merged_ontology_and_lod_kg.apply(lambda row: row['Source_x'].union(row['Source_y']), axis=1).apply(
    lambda items: [i for i in items]).apply(json.dumps)
merged_ontology_and_lod_kg.drop(columns=['Source_x', 'Source_y'], inplace=True)
merged_ontology_and_lod_kg.head(n=3)

<!-- We append information of "meta-edges" from RO to each relation.

In [None]:
'''ro_graph = Graph()
ro_graph.parse(ontology_data_location + 'ro_with_imports.owl')
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')

def get_superproperties(axiom, ro_graph):
    superproperties = set([str(axiom)])  # Initialize with the current property
    direct_superproperties = [x for x in ro_graph.objects(axiom, RDFS.subPropertyOf)]  # Find direct superproperties of the axiom
    for sp in direct_superproperties:
        superproperties.update(get_superproperties(sp, ro_graph))  # Recurse on each superproperty
    return superproperties

cls = {x for x in gets_object_properties(ro_graph)}
master_synonyms = {x for x in ro_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)}

for x in tqdm(cls):
    labels = list({x for x in ro_graph.objects(x, RDFS.label) if '@' not in n3(x) or '@en' in n3(x)})
    labels = labels[0] if labels else np.nan
    synonym = {str(i[2]).lower().strip().replace(" ","_").replace("-","_") for i in master_synonyms if x == i[0]}
    desc = list({str(x).lower().strip() for x in ro_graph.objects(x, obo.IAO_0000115) if '@' not in n3(x) or '@en' in n3(x)})
    desc = desc[0] if desc else np.nan
    superproperties = get_superproperties(x, ro_graph)
    
    relation_metadata_dict[str(x)] = {'Label': labels, 'Description': desc, 'Synonym': synonym, "Hierarchy": superproperties}

ro_df = pd.DataFrame(relation_metadata_dict).T 

def clean_hierarchy(hierarchy, axiom): # Exclude the current axiom from its own superproperties hierarchy
    if str(axiom) in hierarchy:
        hierarchy.remove(str(axiom))
    return ', '.join(hierarchy) if hierarchy else np.nan

ro_df['Hierarchy'] = ro_df.apply(lambda row: clean_hierarchy(row['Hierarchy'], row.name), axis=1)
uri_to_label = {}

# We replace URIs with their labels in the hierarchy
for s in ro_graph.subjects(RDFS.label, None):
    labels = [str(o).replace(" ", "_").replace("-","_") for o in ro_graph.objects(s, RDFS.label) if '@' not in str(o) or '@en' in str(o)]
    if labels:
        uri_to_label[str(s)] = labels[0]

def replace_uris_with_labels(hierarchy):
    if pd.isna(hierarchy) or not isinstance(hierarchy, str):
        return set()
    uris = hierarchy.split(', ')
    labels = {uri_to_label.get(uri.strip(), uri) for uri in uris}
    return labels if labels else set()

ro_df['Hierarchy'] = ro_df['Hierarchy'].apply(replace_uris_with_labels)
ro_df['Label'] = ro_df['Label'].str.replace(" ","_").str.replace("-","_")
ro_df['Label'] = ro_df['Label'].dropna()
ro_df = ro_df.reset_index(drop=True)

ro_df = pd.concat([ro_df, pd.DataFrame([{"Label": "subclassof", "Synonym": set(["is_a"]), "Description": np.nan, "Hierarchy":set()}])], ignore_index=True)

ro_df_neg = ro_df.copy()
ro_df_neg['Label'] = "not_" + ro_df_neg['Label']
ro_df_neg['Description'] = ro_df_neg['Description'].apply(lambda x: np.nan if pd.isna(x) else "Negation of: " + str(x))
ro_df_neg['Hierarchy'] = ro_df_neg['Hierarchy'].apply(lambda x: set() if pd.isna(x) else {"not_" + i for i in x})
ro_df_neg['Synonym'] = ro_df_neg['Synonym'].apply(lambda x: set() if pd.isna(x) else {"not_" + i for i in x})
ro_df = pd.concat([ro_df, ro_df_neg])

ro_df['Synonym'] = ro_df['Synonym'].apply(lambda x: json.dumps(list(x)))
ro_df['Hierarchy'] = ro_df['Hierarchy'].apply(lambda x: json.dumps(list(x)))
ro_df['Label'] = ro_df['Label']
ro_df.rename(columns={'Hierarchy':'Parent'}).to_csv(unprocessed_edge_data_location + 'relation_metadata.csv', index=False)
ro_df.head(n=3)'''

In [None]:
#merged_ontology_and_lod_kg = pd.merge(merged_ontology_and_lod_kg, ro_df, left_on=':TYPE', right_on='Label', how='left').drop(columns=['Label'])
merged_ontology_and_lod_kg.to_csv(processed_data_location + 'merged_ontology_and_lod_kg.csv', index=False)
merged_ontology_and_lod_kg.head(n=3)

In [None]:
for filename in tqdm(os.listdir(unprocessed_edge_data_location)):
    if filename.endswith('.csv') and filename not in list(type):
        print("Processing:", filename)
        df = pd.read_csv(unprocessed_edge_data_location + filename)
        #df = pd.merge(df, ro_df, left_on=':TYPE', right_on='Label', how='left').drop(columns=['Label'])
        df.to_csv(processed_data_location + filename)

***
Fix Neo4j import.

In [None]:
df_nodes = pd.read_csv(processed_data_location + 'nodes.csv', low_memory=False)
df_nodes.rename(columns={':TYPE': ':LABEL','Genomic_location':'Genomic_coordinates:string[]',
                   'Mutation':'Mutation:string[]', 'Synonym':'Synonym:string[]',
                   'ID':'ID:string', 'Charge':'Charge:long','Mass':'Mass:double'}, inplace=True)
df_nodes = df_nodes.drop(columns=['KG_ID'])

for col in [':LABEL', 'Genomic_coordinates:string[]', 'Mutation:string[]', 'Synonym:string[]']:
    df_nodes[col] = df_nodes[col].apply(lambda x: ";".join([str(i).replace(";", ",").strip() for i in ast.literal_eval(x)])  
                                        if pd.notna(x) and isinstance(x, str) and x.startswith("[") else x)
    df_nodes[col] = df_nodes[col].replace("", np.nan)

df_nodes['Charge:long'] = df_nodes['Charge:long'].astype('Int64')
df_nodes['ID:string'] = df_nodes['ID:string'].replace(r"nan$", np.nan, regex=True)
df_nodes['ID:string'] = df_nodes['ID:string'].replace(r"<NA>$", np.nan, regex=True)
df_nodes.loc[((df_nodes['Label'].fillna('').str.contains('RNA binding protein', case=False) |
              df_nodes['Synonym:string[]'].fillna('').str.contains('RNA binding protein', case=False)) &
              df_nodes['URI:ID'].fillna('').str.startswith('http://purl.obolibrary.org/obo/PR_')), ':LABEL'] += ';RBP'
df_nodes.loc[((df_nodes['Label'].fillna('').str.contains('transcription factor', case=False) |
              df_nodes['Synonym:string[]'].fillna('').str.contains('transcription factor', case=False)) &
              df_nodes['URI:ID'].fillna('').str.startswith('http://purl.obolibrary.org/obo/PR_')), ':LABEL'] += ';TF'
df_nodes.loc[df_nodes['Label'].fillna('').str.startswith('piR-'), ':LABEL'] += ';sncRNA;small_regulatory_ncRNA;piRNA'
df_nodes.loc[df_nodes[':LABEL'].str.contains('RNA', na=False), 'Label'] = df_nodes.loc[
    df_nodes[':LABEL'].str.contains('RNA', na=False), 'Label'].str.replace("(human) ", "", regex=False)
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace(";nan;",";")   
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace(";nan$","",regex=True)
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("2MOe","2MOe_ASO")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("2OMOE","2OMOE_ASO")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("2OMe","2OMe_ASO")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("aptamer","Aptamer")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("bacterial_RNA","Bacterial_RNA")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("enzymatic_RNA","Enzymatic_RNA")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("ribozyme","Ribozyme")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("hammerhead_Ribozyme","Hammerhead_ribozyme")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("intron","Intron")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("retained_Intron","Retained_intron")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("modified PMO","Modified_PMO")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("unModified_PMO","Unmodified_PMO")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("oligo","Oligo")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("antisense_Oligonucleotide","Antisense_oligonucleotide")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("RNA_Antisense_oligonucleotide","RNA_antisense_oligonucleotide")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("precursor_RNA","Precursor_RNA")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("riboswitch","Riboswitch")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("small_regulatory_ncRNA","Small_regulatory_ncRNA")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("substitution","Variant;Somatic_variant;Substitution")
df_nodes[':LABEL'] = df_nodes[':LABEL'].str.replace("telomerase_RNA","Telomerase_RNA")

df_nodes.head(n=2)

In [None]:
merged_ontology_kg = pd.read_csv(processed_data_location + 'merged_ontology_and_lod_kg.csv', low_memory=False)

merged_ontology_kg['Source'] = merged_ontology_kg['Source'].astype(str)
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("\"", "", regex=True)
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("[", "")
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.replace("]", "")
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].str.split(", ")
merged_ontology_kg['Source'] = merged_ontology_kg['Source'].apply(lambda x: list(x)).apply(json.dumps)

merged_ontology_kg.rename(columns={'PubMedID':'PubMedID:string[]','GO_evidence':'GO_evidence:string[]', #'Hierarchy':'Hierarchy:string[]',
                                   'TPM':'TPM:double', 'Context':'Context:string[]', 'Method':'Method:string[]', 'p-value':'p-value:double',
                                   'FDR':'FDR:double', 'RNAsister_score':'RNAsister_score:double', 'Interactor':'Interactor:string[]', 
                                   'GeneMANIA_weight':'GeneMANIA_weight:double', 'Source':'Source:string[]'#,'Description':'Description:string',
                                   #'Synonym':'Synonym:string[]',
                                   }, inplace=True)

for col in ['PubMedID:string[]','GO_evidence:string[]','Context:string[]', 'Method:string[]',
            'Interactor:string[]', 'Source:string[]'#,'Hierarchy:string[]', 'Synonym:string[]'
            ]:
    merged_ontology_kg[col] = merged_ontology_kg[col].apply(lambda x: ";".join([str(i).replace(";", ",").strip() for i in ast.literal_eval(x)])
                                                            if pd.notna(x) and isinstance(x, str) and x.startswith("[") else x)  
    merged_ontology_kg[col] = merged_ontology_kg[col].replace("", np.nan)

#merged_ontology_kg.to_csv(processed_data_location + "test_edges1.csv", index=False)
merged_ontology_kg.head(n=3)

In [None]:
avoid = ['OBO_not_expresses_RNA.csv', 'RNA_not_expressed_in_OBO.csv', 'RNA_is_causal_somatic_mutation_in_OBO.csv','test_nodes.csv',
         'RNA_in_similarity_relationship_with_RNA.csv', 'RNA_causally_influenced_by_COSMIC.csv', 'merged_ontology_and_lod_kg.csv',
         'COSMIC_causally_influences_RNA.csv', 'RNA_interacts_with_RNA.csv', 'test_edges1.csv', 'test.csv', 'nodes.csv', 'test_nodes.csv',
         'test_edges1.csv', 'test_edges2.csv', 'test_edges3.csv', 'test_edges4.csv', 'test_edges5.csv', 'test_edges.csv']
lst = []

for filename in tqdm(os.listdir(processed_data_location)):
    if filename.endswith('.csv') and filename not in avoid:
        print("Processing:", filename)
        lst.append(pd.read_csv(processed_data_location + filename, low_memory=False))

df_edges2 = pd.concat(lst, ignore_index=True).drop(columns=['Unnamed: 0','Synonym','Hierarchy','Description'])

df_edges2['Source'] = df_edges2['Source'].astype(str)
df_edges2['Source'] = df_edges2['Source'].str.replace("\"", "", regex=True)
df_edges2['Source'] = df_edges2['Source'].str.replace("[", "")
df_edges2['Source'] = df_edges2['Source'].str.replace("]", "")
df_edges2['Source'] = df_edges2['Source'].str.split(", ")
df_edges2['Source'] = df_edges2['Source'].apply(lambda x: list(x)).apply(json.dumps)
df_edges2['Knockdown_percentage'] = df_edges2['Knockdown_percentage'].astype(str)
df_edges2['Knockdown_percentage'] = df_edges2['Knockdown_percentage'].str.replace("\"", "", regex=True)
df_edges2['Knockdown_percentage'] = df_edges2['Knockdown_percentage'].str.replace("[", "")
df_edges2['Knockdown_percentage'] = df_edges2['Knockdown_percentage'].str.replace("]", "")
df_edges2['Exon'] = df_edges2['Exon'].astype(str)
df_edges2['Exon'] = df_edges2['Exon'].str.replace("\"", "", regex=True)
df_edges2['Exon'] = df_edges2['Exon'].str.replace("[", "")
df_edges2['Exon'] = df_edges2['Exon'].str.replace("]", "")
									
df_edges2.rename(columns={'Regulator':'Regulator:string[]','Interactor':'Interactor:string[]','Source':'Source:string[]',
                          'PubMedID':'PubMedID:string[]','Method':'Method:string[]','Context':'Context:string[]',
                          'Drug':'Drug:string[]',#'Synonym':'Synonym:string[]','Hierarchy':'Hierarchy:string[]',
                          'Mutation':'Mutation:string[]','GO_evidence':'GO_evidence:string[]','Exon':'Exon:string',
                          'RNAsister_score':'RNAsister_score:double','zScore':'zScore:double','Distance':'Distance:double',
                          'Maximum_RPM':'Maximum_RPM:double','Binding_pos':'Binding_pos:string[]','microT_score':'microT_score:double',
                          'Knockdown_percentage':'Knockdown_percentage:string','Number_of_oligos':'Number_of_oligos:long',
                          'Position':'Position:string[]', 'TPM':'TPM:double', 'FPKM':'FPKM:double', 'RCI':'RCI:double',
                          'Abundance':'Abundance:double', 'FDR':'FDR:double', 'p-value':'p-value:double', 'GeneMANIA_weight':'GeneMANIA_weight:double',
                          'Fold_Change':'Fold_Change:double', 'miRDB_score':'miRDB_score:double', 'Weighted_CS_score':'Weighted_CS_score:double',
                          'TANRIC_score':'TANRIC_score:double','Minimum_free_energy_kcal_mol':'Minimum_free_energy_kcal_mol:double',
                          'log2FC':'log2FC:double'#,'Description':'Description:string'
                          }, inplace=True)

for col in ['Regulator:string[]','Interactor:string[]','Source:string[]', 'PubMedID:string[]','Method:string[]','Context:string[]','Binding_pos:string[]',
            'Drug:string[]'#,'Synonym:string[]','Hierarchy:string[]'
            , 'Mutation:string[]','GO_evidence:string[]','Position:string[]']:
    df_edges2[col] = df_edges2[col].apply(lambda x: ";".join([str(i).replace(";", ",").strip() for i in ast.literal_eval(x)])
                                                            if pd.notna(x) and isinstance(x, str) and x.startswith("[") else x)  
    df_edges2[col] = df_edges2[col].replace("", np.nan)

df_edges2['Number_of_oligos:long'] = df_edges2['Number_of_oligos:long'].astype('Int64') 

#df_edges2.to_csv(processed_data_location + "test_edges2.csv", index=False)
df_edges2.head(n=3)

In [None]:
df_edges3 = pd.read_csv(processed_data_location + 'RNA_interacts_with_RNA.csv', low_memory=False).drop(columns=['Unnamed: 0','Description','Synonym','Hierarchy'])
df_edges3.rename(columns={'RNAsister_score':'RNAsister_score:double'}, inplace=True)
df_edges3['Source'] = df_edges3['Source'].astype(str)
df_edges3['Source'] = df_edges3['Source'].str.replace("\"", "", regex=True)
df_edges3['Source'] = df_edges3['Source'].str.replace("[", "")
df_edges3['Source'] = df_edges3['Source'].str.replace("]", "")
df_edges3['Source'] = df_edges3['Source'].str.split(", ")
df_edges3['Source'] = df_edges3['Source'].apply(lambda x: list(x)).apply(json.dumps)
df_edges3.rename(columns={'Source':'Source:string[]', 'PubMedID':'PubMedID:string[]',
                   #'Description':'Description:string','Synonym':'Synonym:string[]','Hierarchy':'Hierarchy:string[]',
                   'Context':'Context:string[]','miTG_score':'miTG_score:double','Drug':'Drug:string[]','Mutation':'Mutation:string[]',
                   'Method':'Method:string[]','FDR':'FDR:double','Distance':'Distance:double','RNAsister_score':'RNAsister_score:double'}, inplace=True)
for col in ['Source:string[]','PubMedID:string[]','Context:string[]','Drug:string[]',
            #'Synonym:string[]','Hierarchy:string[]',
            'Mutation:string[]','Method:string[]']:
    df_edges3[col] = df_edges3[col].apply(lambda x: ";".join([str(i).replace(";", ",").strip() for i in ast.literal_eval(x)])
                                                            if pd.notna(x) and isinstance(x, str) and x.startswith("[") else x)  
    df_edges3[col] = df_edges3[col].replace("", np.nan)

#df_edges3.to_csv(processed_data_location + 'test_edges3.csv', index=False)
df_edges3.head(n=3)

In [None]:
df_edges4 = pd.concat([pd.read_csv(processed_data_location + 'RNA_causally_influenced_by_COSMIC.csv', low_memory=False),
               pd.read_csv(processed_data_location + 'COSMIC_causally_influences_RNA.csv', low_memory=False)]).drop(
                   columns=['Unnamed: 0', 'Description','Synonym','Hierarchy'])
df_edges4['Source'] = df_edges4['Source'].astype(str)
df_edges4['Source'] = df_edges4['Source'].str.replace("\"", "", regex=True)
df_edges4['Source'] = df_edges4['Source'].str.replace("[", "")
df_edges4['Source'] = df_edges4['Source'].str.replace("]", "")
df_edges4['Source'] = df_edges4['Source'].str.split(", ")
df_edges4['Source'] = df_edges4['Source'].apply(lambda x: list(x)).apply(json.dumps)
df_edges4.rename(columns={'Source':'Source:string[]', 'PubMedID':'PubMedID:string[]', 'Interactor':'Interactor:string[]',
                   #'Description':'Description:string','Synonym':'Synonym:string[]','Hierarchy':'Hierarchy:string[]',
                   'Context':'Context:string[]'}, inplace=True)
for col in ['Source:string[]','PubMedID:string[]','Interactor:string[]','Context:string[]'#,'Synonym:string[]','Hierarchy:string[]'
            ]:
    df_edges4[col] = df_edges4[col].apply(lambda x: ";".join([str(i).replace(";", ",").strip() for i in ast.literal_eval(x)])
                                                            if pd.notna(x) and isinstance(x, str) and x.startswith("[") else x)  
    df_edges4[col] = df_edges4[col].replace("", np.nan)

#df_edges4.to_csv(processed_data_location + 'test_edges4.csv', index=False)
df_edges4.head(n=3)

In [None]:
df_edges5 = pd.concat([pd.read_csv(processed_data_location + 'RNA_is_causal_somatic_mutation_in_OBO.csv', low_memory=False),
                pd.read_csv(processed_data_location + 'RNA_in_similarity_relationship_with_RNA.csv', low_memory=False),
                pd.read_csv(processed_data_location + 'OBO_not_expresses_RNA.csv', low_memory=False),
                pd.read_csv(processed_data_location + 'RNA_not_expressed_in_OBO.csv', low_memory=False)]).drop(columns=['Unnamed: 0',
                                                                                                                        'Description','Synonym','Hierarchy'])
df_edges5['Source'] = df_edges5['Source'].astype(str)
df_edges5['Source'] = df_edges5['Source'].str.replace("\"", "", regex=True)
df_edges5['Source'] = df_edges5['Source'].str.replace("[", "")
df_edges5['Source'] = df_edges5['Source'].str.replace("]", "")
df_edges5['Source'] = df_edges5['Source'].str.split(", ")
df_edges5['Source'] = df_edges5['Source'].apply(lambda x: list(x)).apply(json.dumps)
df_edges5.rename(columns={'Source':'Source:string[]', 'PubMedID':'PubMedID:string[]', 'Interactor':'Interactor:string[]',
                  # 'Description':'Description:string','Synonym':'Synonym:string[]','Hierarchy':'Hierarchy:string[]',
                   'Rfam_score':'Rfam_score:double','Mutation':'Mutation:string[]'}, inplace=True)
for col in ['Source:string[]','PubMedID:string[]','Interactor:string[]','Mutation:string[]'#,'Synonym:string[]','Hierarchy:string[]'
            ]:
    df_edges5[col] = df_edges5[col].apply(lambda x: ";".join([str(i).replace(";", ",").strip() for i in ast.literal_eval(x)])
                                                            if pd.notna(x) and isinstance(x, str) and x.startswith("[") else x)  
    df_edges5[col] = df_edges5[col].replace("", np.nan)
    
#df_edges5.to_csv(processed_data_location + 'test_edges5.csv', index=False)
df_edges5.head(n=3)

In [None]:
df_edges = pd.concat([merged_ontology_kg, df_edges2, df_edges3, df_edges4, df_edges5])#.drop(columns=['Description:string','Synonym:string[]','Hierarchy:string[]'])
df_edges.drop_duplicates(subset=[':START_ID', ':TYPE', ':END_ID'], keep='first', inplace=True)
df_edges.to_csv(processed_data_location + 'test_edges.csv', index=False)

df_nodes = df_nodes[df_nodes['URI:ID'].isin(df_edges[':START_ID']) | df_nodes['URI:ID'].isin(df_edges[':END_ID'])]
df_nodes.to_csv(processed_data_location + "test_nodes.csv", index=False)

In [15]:
df_nodes

Unnamed: 0,:LABEL,Description,Label,Genomic_coordinates:string[],Sequence,URI:ID,Species,Structure,Mutation:string[],Synonym:string[],ID:string,FDA_indications,Charge:long,Mass:double,SMILES,Formula,InChIKey,CAS
0,RNA;ncRNA;lncRNA,homo sapiens (human) znf451 regulatory antisen...,ZNF451 regulatory antisense RNA 1,chr6:57173736-57174236-;chr6:57171005-57174236...,AUGGAGAGAUGUGUGUUACUUGUUAUGUGGCUCCCUAAAAAGAAAC...,https://rnacentral.org/rna/URS0000000055_9606,Homo sapiens,,,,RNAcentral:URS0000000055_9606,,,,,,,
1,RNA;ncRNA;sncRNA;Small_regulatory_ncRNA;piRNA,homo sapiens (human) pir-50304,piR-50304,,UGCAACCAGUGUCUCUGCCUACCCGAUCCU,https://rnacentral.org/rna/URS0000000096_9606,Homo sapiens,,,,RNAcentral:URS0000000096_9606,,,,,,,
2,RNA;ncRNA,homo sapiens (human) ncrna,ENSG00000251803,,GGCUGGUCUGAAGGUAGUGAGUUAUCUCAAUUGAUUGUUCACCGUC...,https://rnacentral.org/rna/URS0000000098_9606,Homo sapiens,,,,RNAcentral:URS0000000098_9606,,,,,,,
3,RNA;ncRNA;lncRNA,homo sapiens (human) mef2c antisense rna 1,MEF2C antisense RNA 1,chr5:88889335-88967279+;chr5:88889335-88889480...,CUGCUCUCAUCACCUAUAUACUCUUCUCUCUGCCCGUCUCUGCUUC...,https://rnacentral.org/rna/URS00000000C9_9606,Homo sapiens,,,,RNAcentral:URS00000000C9_9606,,,,,,,
4,RNA;ncRNA;sncRNA;Small_regulatory_ncRNA;piRNA,homo sapiens (human) pir-55009,piR-55009,,UGGCCCAGGAGGCCUCAAGGGCCCGGUGUU,https://rnacentral.org/rna/URS00000000D2_9606,Homo sapiens,,,,RNAcentral:URS00000000D2_9606,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6553991,Drug,,,,,https://go.drugbank.com/drugs/DB16012,,,,,DrugBank:DB16012,,,,,,,
6553992,Chemical,,"{4-[(1E)-3-oxo-3-{2,3,4-trihydroxy-5-[hydroxy(...",,,http://purl.obolibrary.org/obo/CHEBI_194051,,,,,CHEBI:194051,,0,694.61000,C=1C=C(C=CC1\C(\[H])=C(/[H])\C(C=2C(C(C(C3C(C(...,C27H34O19S,GPIKPRCHBDWSMS-ZZXKWVIFSA-N,
6553993,Chemical,,4-tert-butylbenzoic acid [2-[(2-methoxy-3-dibe...,,,http://purl.obolibrary.org/obo/CHEBI_114246,,,,,CHEBI:114246,,0,431.48100,CC(C)(C)C1=CC=C(C=C1)C(=O)OCC(=O)NC2=C(C=C3C4=...,C26H25NO5,FUBUELHMJDMBQJ-UHFFFAOYSA-N,
6553994,Chemical,,tetrakis(pyridine)silver(2+),,,http://purl.obolibrary.org/obo/CHEBI_30343,,,,,CHEBI:30343,,2,424.26796,C1=CC=[N](C=C1)[Ag++]([N]1=CC=CC=C1)([N]1=CC=C...,C20H20AgN4,OINDLVQQDGASSD-UHFFFAOYSA-N,


In [16]:
df_edges

Unnamed: 0,:START_ID,:TYPE,:END_ID,PubMedID:string[],GO_evidence:string[],TPM:double,Context:string[],Method:string[],FDR:double,RNAsister_score:double,Interactor:string[],p-value:double,GeneMANIA_weight:double,Source:string[],Regulator:string[],Drug:string[],Mutation:string[],zScore:double,Distance:double,Maximum_RPM:double,Binding_pos:string[],microT_score:double,Knockdown_percentage:string,Number_of_oligos:long,Exon:string,Position:string[],FPKM:double,RCI:double,Abundance:double,Fold_Change:double,miRDB_score:double,Weighted_CS_score:double,TANRIC_score:double,Minimum_free_energy_kcal_mol:double,log2FC:double,miTG_score:double,Rfam_score:double
0,http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=...,subclassof,http://purl.obolibrary.org/obo/PR_000018263,,,,,,,,,,,Entity_linking,,,,,,,,,,,,,,,,,,,,,,,
1,http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=...,subclassof,http://purl.obolibrary.org/obo/SO_0000104,,,,,,,,,,,Entity_linking,,,,,,,,,,,,,,,,,,,,,,,
2,http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=...,subclassof,http://purl.obolibrary.org/obo/PR_000018263,,,,,,,,,,,Entity_linking,,,,,,,,,,,,,,,,,,,,,,,
3,http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=...,subclassof,http://purl.obolibrary.org/obo/SO_0000104,,,,,,,,,,,Entity_linking,,,,,,,,,,,,,,,,,,,,,,,
4,http://bigdata.ibp.ac.cn/SmProt/SmProt.php?ID=...,subclassof,http://purl.obolibrary.org/obo/PR_000018263,,,,,,,,,,,Entity_linking,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653796,https://rnacentral.org/rna/URS0002349D57_9606,not_expressed_in,http://purl.obolibrary.org/obo/MONDO_0005108,,,,,,,,,,,LncBook,,,,,,,,,,,,,,,,,,,,,,,
1653797,https://rnacentral.org/rna/URS0002349D58_9606,not_expressed_in,http://purl.obolibrary.org/obo/GO_0007623,,,,,,,,,,,LncBook,,,,,,,,,,,,,,,,,,,,,,,
1653798,https://rnacentral.org/rna/URS0002349D58_9606,not_expressed_in,http://purl.obolibrary.org/obo/GO_0030154,,,,,,,,,,,LncBook,,,,,,,,,,,,,,,,,,,,,,,
1653799,https://rnacentral.org/rna/URS0002349D58_9606,not_expressed_in,http://purl.obolibrary.org/obo/GO_0051179,,,,,,,,,,,LncBook,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df_nodes.to_pickle("test_nodes.pkl")

In [None]:
df_edges.to_pickle("test_edges.pkl")
merged_ontology_kg.to_pickle("test_edges1.pkl")
df_edges2.to_pickle("test_edges2.pkl")
df_edges3.to_pickle("test_edges3.pkl")
df_edges4.to_pickle("test_edges4.pkl")
df_edges5.to_pickle("test_edges5.pkl")

***
### Remove unprocessed raw data
Uncomment the following line if you want to delete the `unprocessed_data` subfolder.

In [None]:
#shutil.rmtree(unprocessed_data_location)


<br>

***
***

```
@misc{cavalleri_e_2024_rna_kg,
  author       = {Cavalleri, E},
  title        = {RNA-KG},
  year         = 2024,
  doi          = {10.5281/zenodo.10078876},
  url          = {https://doi.org/10.5281/zenodo.10078876}
}
```