# <p style="text-align: center;">RNA-KG node properties</p>
    
***
***

**Authors:** [ECavalleri](https://mail.google.com/mail/u/0/?view=cm&fs=1&tf=1&to=emanuele.cavalleri@unimi.it)

**GitHub Repositories:** [RNA-KG](https://github.com/AnacletoLAB/RNA-KG/)
  
<br>  
  
**Purpose:** This notebook serves as a script to add properties to entities within the RNA-centered Knowledge Graph.

<br>

**Assumptions:**   
- Property data write location ➞ `../resources/property_data`  
- Ontologies ➞ `../resources/ontologies`    
- Processed data write location ➞ `../resources/processed_data`  

<br>

**Dependencies:**   
- **Scripts**: This notebook utilizes several helper functions, which are stored in the [`data_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/data_utils.py) and [`kg_utils.py`](https://github.com/callahantiff/PheKnowLator/blob/master/pkt_kg/utils/kg_utils.py) scripts.  
- **Data**: All downloaded and generated data sources are provided through the dedicated Zenodo repository [10.5281/zenodo.10078876](https://zenodo.org/doi/10.5281/zenodo.10078876). <u>This notebook will download everything you need</u>.  
_____
***

In [None]:
%%capture
import sys
!{sys.executable} -m pip install -r requirements.txt
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import gffpandas.gffpandas as gffpd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

In [None]:
# directory to store resources
resource_data_location = '../resources/'

# directory to use for unprocessed data
unprocessed_data_location = '../resources/processed_data/unprocessed_data/'

# directory to use for processed data
processed_data_location = '../resources/processed_data/'

# directory to write ontology data to
ontology_data_location = '../resources/ontologies/'

# directory to write edges data to
edge_data_location = '../resources/edge_data/'

# directory to write node properties to
properties_location = '../resources/property_data/'

# processed data url 
processed_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/processed_data/'

# original data url 
original_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/original_data/'

# owltools location
owltools_location = '../pkt_kg/libs/owltools'

***
# pre-miRNA

In [None]:
from Bio import SeqIO

# Fetch the miRBase EMBL-format dump into the processed-data directory
# (data_downloader comes from the pkt_kg utilities imported above).
data_downloader('https://www.mirbase.org/download/miRNA.dat', processed_data_location)

# Path of the downloaded EMBL file
embl_file = processed_data_location + 'miRNA.dat'

# Column-oriented accumulator: one list per output column, one entry per record
data = {
    "ID": [],
    "Description": [],
    "Sequence": [],
    "Comments": [],
    "References": [],
    "Feature Table": []
}

# Walk every record of the EMBL file and collect its fields
for record in SeqIO.parse(embl_file, "embl"):
    annotations = record.annotations

    # NOTE(review): the f-string formats the tuple ([i], ref.pubmed_id), so every
    # entry is written as that tuple's repr; kept unchanged so the generated
    # CSV keeps its current format.
    references = [f"{[i], ref.pubmed_id}"
                  for i, ref in enumerate(annotations.get('references', []), start=1)]

    data["ID"].append(record.id)
    data["Description"].append(record.description)
    data["Sequence"].append(str(record.seq))
    data["Comments"].append(str(annotations.get('comment', '')))
    data["References"].append(", ".join(references))
    data["Feature Table"].append("\n".join(str(feature) for feature in record.features))

df = pd.DataFrame(data)

# Keep only records whose description mentions Homo sapiens
is_human = df['Description'].astype(str).str.contains('Homo sapiens')
df = df[is_human]

# One row per "type: miRNA" chunk of the printed feature table; empty chunks are dropped
df['Feature Table'] = df['Feature Table'].str.split("type: miRNA")
df = df.explode('Feature Table')
df = df[df['Feature Table'] != '']
df

In [None]:
df['Feature Table'] = df['Feature Table'].str.split("\n")
list(df['Feature Table'].loc[57])

In [None]:
def extract_values(row):
    """Turn the text lines of one printed feature into a Series of its fields.

    ``row`` is an iterable of strings. A line containing ``"location: "``
    contributes a ``location`` entry; a line containing ``"Key: "`` is read as
    ``Key: <name>, Value: <value>`` and contributes ``<name>``. Surrounding
    spaces, quotes and square brackets are trimmed from the value. All other
    lines are ignored.
    """
    parsed = {}
    for line in row:
        if "location: " in line:
            parsed['location'] = line.split("location: ")[1]
            continue
        if "Key: " not in line:
            continue
        # qualifier[0] is the name, qualifier[1] the printed list holding the value
        qualifier = line.split("Key: ")[1].split(", Value:")
        name = qualifier[0].strip()
        # The two strips remove quote/bracket characters from both ends, so a
        # closing ']' that belongs to the value text itself is trimmed as well.
        parsed[name] = qualifier[1].strip(" ['").strip("'']")
    return pd.Series(parsed)

# Apply the function to create new columns
new_columns = df['Feature Table'].apply(extract_values)

# Concatenate the new columns with the original DataFrame
df = pd.concat([df, new_columns], axis=1)
df

In [None]:
premirna = df[['ID', 'Description', 'Sequence', 'Comments', 'References', 'mod_base']]
premirna = premirna.rename(columns={'mod_base':'Modification'})
premirna

In [None]:
miRBaseMap = pd.read_csv(processed_data_location + 'MIRNA_MIRBASE_MAP.txt', header=None, sep='\t')
miRBaseMap

In [None]:
premirna = pd.merge(df, miRBaseMap, left_on=['ID'], right_on=[1])
premirna['Label'] = premirna[0]
premirna

In [None]:
miRNA_variant = pd.read_csv(unprocessed_data_location + "miRNet-snp-mir-hsa.csv?dl=0")
miRNA_variant = miRNA_variant[miRNA_variant['High_Confidence']=='YES']
miRNA_variant = miRNA_variant[['MIRNA_Name','Family_Name']]
miRNA_variant = pd.merge(miRNA_variant, miRBaseMap, left_on=['MIRNA_Name'], right_on=[0]).drop(columns=['MIRNA_Name',0])

miRNA_variant

In [None]:
premirna = pd.merge(premirna, miRNA_variant, left_on=['ID'], right_on=[1], how='outer').rename(columns={'Family_Name':'Family name'})
premirna[['ID','Label','Description','Sequence','Family name','Comments','References']].drop_duplicates().to_csv(properties_location + 'premiRNA.csv', index=None)

***
# miRNA

In [None]:
# Build the mature-miRNA property table from the per-feature rows in `df`:
# the precursor 'ID' is dropped and the feature accession becomes the new 'ID'.
mirna = df.drop(columns=['ID']).rename(columns={'accession':'ID',
                                                'location':'Location',
                                                'evidence':'Evidence',
                                                'experiment':'Experiment',
                                                'product':'Label'})

# extract_values() trims bracket characters from both ends of each value, which
# also removes a closing ']' that is part of the experiment text; put it back.
# NOTE(review): the ']' is appended unconditionally - confirm every experiment
# value really ends with a bracketed reference.
mirna['Experiment'] = mirna['Experiment'] + ']'

# Assign through the column key: `mirna.evidence = ...` would only set a Python
# attribute on the DataFrame (the column is now called 'Evidence') and the
# replacement would never reach the exported table.
mirna['Evidence'] = mirna['Evidence'].replace('experimental',
                                              'http://purl.obolibrary.org/obo/NCIT_C43622 (experimental method)')
mirna = mirna[['ID','Label','References','Location','Evidence','Experiment']]
mirna

In [None]:
mirna.drop_duplicates().to_csv(properties_location + 'miRNA.csv', index=None)

***
# tsRNA

In [None]:
tsRNA = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")  
tsRNA

In [None]:
tsRNA[['tsRNAid', 'seq']].drop_duplicates().to_csv(properties_location + 'tsRNA.csv', index=None)

***
# tRF

## tRFdb

In [None]:
#http://genome.bioch.virginia.edu/trfdb/index.php
tRF1_tRNA = pd.read_html(unprocessed_data_location+'trf1.html')[2]
tRF1_tRNA.drop(columns=['Organism'],inplace=True)
tRF1_tRNA.head()

tRF3_tRNA = pd.read_html(unprocessed_data_location+'trf3.html')[2]
tRF3_tRNA.drop(columns=['Organism'],inplace=True)

tRF5_tRNA = pd.read_html(unprocessed_data_location+'trf5.html')[2]
tRF5_tRNA.drop(columns=['Organism'],inplace=True)

tRF_tRNA = pd.concat([tRF1_tRNA,tRF3_tRNA,tRF5_tRNA])
tRF_tRNA = tRF_tRNA.drop(columns=['Experiment Info', 'Sequence'])
tRF_tRNA['tRF ID'] = tRF_tRNA['tRF ID'].astype(str)
tRF_tRNA

In [None]:
import re

def get_numbers(identifier):
    """Extract link targets from the saved tRFdb listing ``trf<identifier>.html``.

    The file is read from ``unprocessed_data_location``. Returns a dict with:
      - ``'sequence_numbers'``: the ``seq_id`` values of all
        ``sequence_display.php`` links, as ints, in page order;
      - ``'experiment_numbers'``: the text following ``trf_id=`` (up to the
        closing quote) of all ``experiments_display.php`` links, as strings.
    """
    page_path = unprocessed_data_location + 'trf' + identifier + '.html'
    with open(page_path, 'r', encoding='utf-8') as handle:
        page = handle.read()

    sequence_ids = list(map(int, re.findall(r'href=\'sequence_display.php\?seq_id=(\d+)', page)))
    experiment_ids = re.findall(r"href='experiments_display.php\?trf_id=(.*?)'", page)

    return {'sequence_numbers': sequence_ids, 'experiment_numbers': experiment_ids}

In [None]:
def transform(original_html):
    """Rewrite a tRFdb sequence page so each field sits in its own <font> element.

    Applies a fixed sequence of substitutions, in order, that insert line
    breaks and closing/opening ``<font>`` tags around the "Organism",
    "tRF Sequence" and "Map Position" labels and replace the Courier font tag.
    Returns the rewritten HTML string.
    """
    # (pattern, replacement) pairs; order matters because later rules see the
    # output of earlier ones.
    rules = (
        (r'<font face=', '\n<font face='),
        (r'<br><b>Organism:', "</font><br>\n<font face='Arial' size='2'><b>Organism:"),
        (r'<br><b>tRF Sequence:', "</font><br>\n<font face='Arial' size='2'><b>tRF Sequence:"),
        (r"<font face='Courier' size='3'>", "</font><br>\n<font face='Arial' size='2'>"),
        (r"<br><b>Map Position:", "\n<font face='Arial' size='2'><b>Map Position:"),
    )

    transformed_html = original_html
    for pattern, replacement in rules:
        transformed_html = re.sub(pattern, replacement, transformed_html)
    return transformed_html

In [None]:
import requests

def get_html(identifier, timeout=60):
    """Download the tRFdb sequence page for ``identifier`` (a seq_id string).

    Args:
        identifier: seq_id value appended to the sequence_display.php URL.
        timeout: seconds to wait for the server before requests raises a
            timeout error, so a stalled connection cannot hang the notebook.

    Returns:
        The response body as text when the server answers with HTTP 200 or
        500, otherwise None.
    """
    url = 'http://genome.bioch.virginia.edu/trfdb/sequence_display.php?seq_id=' + identifier
    response = requests.get(url, timeout=timeout)
    # The original check accepted only status 500, so a normal 200 answer was
    # thrown away. 500 is still accepted for backward compatibility.
    # NOTE(review): presumably the server returned 500 together with a usable
    # page when this was written - confirm and drop 500 if no longer needed.
    if response.status_code in (200, 500):
        return response.text
    return None

In [None]:
from bs4 import BeautifulSoup


def _scrape_trf_pages(identifier):
    """Return one single-row DataFrame per retrievable sequence page of a listing.

    ``identifier`` selects the saved listing file (trf<identifier>.html). For
    every sequence number found there, the sequence page is downloaded,
    normalised with transform() and its <font> texts become the row values;
    the matching experiment link text is stored in 'Experiment Number'.
    Pages for which get_html() returns None are skipped.
    """
    result = get_numbers(identifier)
    # Pairs sequence numbers with experiment links by position in the page
    numbers_mapping = dict(zip(result['sequence_numbers'], result['experiment_numbers']))

    frames = []
    for seq_id in result['sequence_numbers']:
        html_content = get_html(str(seq_id))  # Retrieve HTML content
        if html_content is None:
            continue

        soup = BeautifulSoup(transform(html_content), 'html.parser')
        values = [font.get_text() for font in soup.find_all('font')]
        # Keep the text after the label; only the piece between the first and
        # second ':' is kept when a value itself contains ':'.
        values = [value.split(":")[1].strip() if ":" in value else value for value in values]

        # One row, positional column names, plus the experiment link text
        temp = pd.DataFrame(values).T
        temp.columns = range(temp.shape[1])
        temp['Experiment Number'] = numbers_mapping.get(seq_id, None)
        frames.append(temp)
    return frames


# Scrape the tRF-1, tRF-3 and tRF-5 listings in that order and stack the rows once
trf_frames = [frame for listing in ('1', '3', '5') for frame in _scrape_trf_pages(listing)]
df = pd.concat(trf_frames, ignore_index=True) if trf_frames else pd.DataFrame()

In [None]:
def extract_chr_substring(text):
    """Return the part of ``text`` from the first 'chr' up to the next '&'.

    Returns '' when ``text`` is not a string (e.g. None or NaN for rows that
    have no experiment link), when it contains no 'chr', or when no '&'
    follows the first 'chr'.
    """
    if not isinstance(text, str):
        return ''
    start_index = text.find('chr')
    if start_index == -1:
        return ''
    end_index = text.find('&', start_index)
    if end_index == -1:
        return ''
    return text[start_index:end_index]

df['Experiment Number'] = df['Experiment Number'].apply(extract_chr_substring)
df.columns = ['tRF ID','organism','empty','Sequence','Map Position','tRNA Gene Co-ordinates']
df = df.drop(columns=['organism','empty'])
df

In [None]:
tRF = pd.merge(tRF_tRNA,df,on=['tRF ID', 'tRNA Gene Co-ordinates'])
tRF

In [None]:
tRF['tRF ID'] = "trfdb?" + tRF['tRF ID'].astype(str)

In [None]:
tRF.drop_duplicates().to_csv(properties_location + 'tRF_tRFdb.csv', index=None)

## MINTBASE

In [None]:
tRNA_MINTbase_GtRNAdb_map=pd.read_csv(
    processed_data_location + 'tRNA_MINTbase_GtRNAdb_MAP.txt', header=None, sep='\t')
tRNA_MINTbase_GtRNAdb_map=tRNA_MINTbase_GtRNAdb_map.rename(columns={0:'MINTbase tRNA name',1:'gtRNAdb name'})
tRNA_MINTbase_GtRNAdb_map

In [None]:
# https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbasetRF-tRNA.txt',sep='\t')
tRF_tRNA2['MINTbase Alternative IDs (GRCh37 assembly-derived)'] = tRF_tRNA2['MINTbase Alternative IDs (GRCh37 assembly-derived)'].str.split('@').str[0]
tRF_tRNA2.rename(columns={'MINTbase Alternative IDs (GRCh37 assembly-derived)':'MINTbase tRNA name'},inplace=True)
tRF_tRNA2 = pd.merge(tRF_tRNA2, tRNA_MINTbase_GtRNAdb_map, on='MINTbase tRNA name')
tRF_tRNA2 = tRF_tRNA2[['License Plate (sequence derived)','Type','Fragment sequence','gtRNAdb name','MINTbase tRNA name']]
tRF_tRNA2

In [None]:
tRF_tRNA2.drop_duplicates().to_csv(properties_location + 'tRF_MINTBASE.csv', index=None)

***
# tRNA

In [None]:
! wget http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa -P ../resources/processed_data/unprocessed_data/

In [None]:
from Bio.SeqIO.FastaIO import SimpleFastaParser

identifiers = []
seq = []

# Replace the URL with the path to your local FASTA file
fasta_file_path = unprocessed_data_location + 'hg38-tRNAs.fa'

with open(fasta_file_path) as fasta_file:
    for title, sequence in SimpleFastaParser(fasta_file):
        identifiers.append(title.split(None, 1)[0])  # First word is ID
        seq.append(sequence)
        
data = {"Identifier": identifiers, "Sequence": seq}
df = pd.DataFrame(data)
df

In [None]:
all(df['Identifier'].str.startswith('Homo_sapiens_'))

In [None]:
df['Identifier'] = df['Identifier'].str[len('Homo_sapiens_'):]
df

In [None]:
# Seed the tRNA property table with the gene page of tRNA-Ala-AGC-1-1.
# The page is downloaded and parsed once; its first two HTML tables are
# transposed and placed side by side.
first_gene_tables = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/tRNA-Ala-AGC-1-1.html')
tRNA = first_gene_tables[0].T
tRNA2 = first_gene_tables[1].T
tRNA = pd.concat([tRNA,tRNA2],axis=1)
# After transposing, the first row carries the field names: promote it to the header
tRNA.columns = tRNA.iloc[0]
tRNA = tRNA[1:]
tRNA

In [None]:
# Fetch the gene page of every remaining tRNA and append its properties.
# [1:] skips the first identifier - this assumes it is tRNA-Ala-AGC-1-1, which
# the previous cell already loaded into `tRNA`.
gene_frames = []
for identifier in df['Identifier'][1:]:
    # Download and parse each page once, then use both of its first two tables
    gene_tables = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/' + identifier + '.html')
    temp = pd.concat([gene_tables[0].T, gene_tables[1].T], axis=1)
    # First row of the transposed tables holds the field names
    temp.columns = temp.iloc[0]
    gene_frames.append(temp[1:])

# Single concatenation instead of growing the table inside the loop
tRNA = pd.concat([tRNA, *gene_frames])

tRNA.Locus = tRNA.Locus.str.replace(' View in Genome Browser', '')
tRNA = tRNA.drop(columns=['Organism', 'Known Modifications (Modomics)'])

# Append '.html' to the gene symbol and move that column to the front
tRNA['GtRNAdb Gene Symbol'] = tRNA['GtRNAdb Gene Symbol'].astype(str) + '.html'
tRNA = tRNA[['GtRNAdb Gene Symbol'] + [col for col in tRNA.columns if col != 'GtRNAdb Gene Symbol']]
tRNA

In [None]:
tRNA.drop_duplicates().to_csv(properties_location + 'tRNA.csv', index=None)

***
# Small protein

In [None]:
lncRNA_protein = pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz') 
lncRNA_protein = lncRNA_protein[lncRNA_protein['Symbol']!='-']
lncRNA_protein.drop(columns=['Gene ID','Symbol','Transcript ID','Experimental Evidence'],inplace=True)
lncRNA_protein

In [None]:
lncRNA_protein.drop_duplicates().to_csv(properties_location + 'smallProtein.csv', index=None)

***
# Riboswitch

## TBDB

In [None]:
riboswitch_protein = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',') 
riboswitch_protein = riboswitch_protein[[
    'unique_name', 'Name', 'Sequence', 'Tbox_start' , 'Tbox_end', 'Structure', 's1_start', 's1_loop_start',
    's1_loop_end', 's1_end', 'antiterm_start', 'antiterm_end', 'term_start', 'term_end', 'codon_start',
    'codon_end', 'codon', 'codon_region', 'discrim_start', 'discrim_end', 'discriminator', 'warnings',
    'type', 'source', 'whole_antiterm_structure', 'other_stems', 'whole_antiterm_warnings', 'term_sequence',
    'term_structure', 'terminator_energy', 'term_errors', 'antiterm_term_sequence',
    'infernal_antiterminator_structure', 'vienna_antiterminator_structure', 'vienna_antiterminator_energy',
    'vienna_antiterminator_errors', 'terminator_structure', 'terminator_errors', 'new_term_structure',
    'new_term_energy', 'new_term_errors', 'whole_term_structure', 'folded_antiterm_structure', 'Trimmed_sequence',
    'Trimmed_antiterm_struct', 'Trimmed_term_struct', 'accession_url', 'accession_name', 'locus_start', 
    'locus_end', 'locus_view_start', 'locus_view_end', 'deltadelta_g', 'TaxId'
]]
riboswitch_protein

In [None]:
riboswitch_protein.drop_duplicates().to_csv(properties_location + 'riboswitch_TBDB.csv', index=None)

## RSwitch

In [None]:
riboswitch_bactStrain = pd.read_csv(unprocessed_data_location + 'rswitch.csv', header=None) 
riboswitch_bactStrain.rename(columns={0:'Riboswitch',1:'Type'},inplace=True)
riboswitch_bactStrain

In [None]:
riboswitch_bactStrain[['Riboswitch','Type']].drop_duplicates().to_csv(properties_location + 'riboswitch_RSwitch.csv', index=None)

***
# Viral RNA

In [None]:
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T 

# Extract ribozymes 
myre = re.compile(r"\n>> .*?\n")
ribozyme = [myre.findall(i) for i in vRNA_ribozyme.ribozymes]
ribozyme = [[j.replace("\n",'').replace(">> ",'') for j in i] for i in ribozyme]

# List of all possible ribozymes (useful for mapping)
a = [i for j in ribozyme for i in j]
set(a)

vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']),
                           pd.Series(ribozyme)], axis=1)
vRNA_ribozyme = vRNA_ribozyme.explode(0)
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
vRNA_ribozyme=vRNA_ribozyme[['accession', 'identicalSeqs', 'submitters', 'releaseDate', 'isolate', 'species',
                            'genus', 'family', 'moleculeType', 'sequenceType', 'nucCompleteness', 'genotype', 'segment',
                            'publications', 'geoLocation', 'host', 'isolationSource', 'collectionDate', 'bioSample',
                            'genBankTitle', 'displayTitle', 'sequence', 'structure', 'type', 'ribozymes',
                            'Cls_ID80', 'Cls_ID70', 'Cls_ID85', 'Cls_ID75', 'Cls_ID95', 'Cls_ID90']]

vRNA_ribozyme['identicalSeqs'] = vRNA_ribozyme['identicalSeqs'].astype(str)
vRNA_ribozyme['structure'] = vRNA_ribozyme['structure'].astype(str)
vRNA_ribozyme

In [None]:
vRNA_ribozyme.drop_duplicates().to_csv(properties_location + 'viralRNA.csv', index=None)

***
# Aptamer

## Apta-Index

In [None]:
aptamer_protein = pd.read_csv(unprocessed_data_location + 'aptaindex.csv',names=['Name', 'ID', 'Target', 'Sequence'],skiprows=[0]) 
aptamer_protein.Target = aptamer_protein.Target.str.lower()
aptamer_protein['ID'] = 'aptamer-details/?id=' + aptamer_protein['ID'].astype(str)
aptamer_protein = aptamer_protein.drop(columns=['Target'])
aptamer_protein = aptamer_protein[['ID','Name','Sequence']]
aptamer_protein

In [None]:
aptamer_protein.drop_duplicates().to_csv(properties_location + 'aptamer.csv', index=None)

***
# Ribozyme

## Rfam

In [None]:
ribozyme_rfam_map = pd.DataFrame(data=[['LC ribozyme','family/RF00011'],
                                 ['hammerhead ribozyme','clan/CL00010'],
                                 ['glmS ribozyme','family/RF00234'],
                                 ['HDV-F-prausnitzii','family/RF02682'],
                                 ['HDV ribozyme','family/RF00094'],
                                 ['HDV_ribozyme','family/RF00094'],
                                 ['Hairpin','family/RF00173'],
                                 ['Hammerhead_1','clan/CL00010'],
                                 ['Hammerhead_HH9','clan/CL00010'],
                                 ['Hammerhead_3','clan/CL00010'],
                                 ['Hammerhead_HH10','clan/CL00010'],
                                 ['Hammerhead_II','clan/CL00010'],
                                 ['Pistol','family/RF02679'],
                                 ['Pistol ribozyme','family/RF02679'],
                                 ['twister ribozyme','clan/CL00120'],
                                 ['Twister-P5','clan/CL00120'],
                                 ['Twister-P3','clan/CL00120'],
                                 ['RNAse P','family/RF00009']#,
                                 #['VS ribozyme',''] absent in RFAM
                                 ])

ribozyme_rfam_map

In [None]:
import requests
from Bio import SeqIO
from io import StringIO

# Only 'family/...' entries are fetched here; 'clan/...' entries are left out
ribozyme_family = ribozyme_rfam_map[ribozyme_rfam_map[1].str.contains('family')]
ribozyme_sequences = {}

# Several labels map to the same Rfam family, so iterate over distinct IDs to
# avoid downloading the same alignment more than once.
for ribozyme in ribozyme_family[1].drop_duplicates():
    accession = ribozyme.rsplit('/')[1]
    url = 'http://rfamlive.xfam.org/' + ribozyme + '/alignment?acc=' + accession + '&format=fasta&download=1'
    response = requests.get(url)
    # NOTE(review): the HTTP status is not checked; a failed request is parsed
    # like a FASTA body and may silently yield no sequences.
    fasta_handle = StringIO(response.text)
    sequences = list(SeqIO.parse(fasta_handle, 'fasta'))
    ribozyme_sequences[ribozyme] = sequences

In [None]:
data = []
for ribozyme, seq_records in ribozyme_sequences.items():
    sequences = [str(seq_record.seq) for seq_record in seq_records]
    data.append({'ribozyme': ribozyme, 'sequence(s)': sequences})

# Create a Pandas DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df

In [None]:
ribozyme_rfam_map = pd.merge(ribozyme_rfam_map,df,left_on=[1],right_on=['ribozyme'], how='outer').drop(columns=['ribozyme'])
ribozyme_rfam_map['sequence(s)'] = ribozyme_rfam_map['sequence(s)'].apply(
    lambda x: '; '.join(map(str, x)) if not isinstance(x, float) else '')
ribozyme_rfam_map.rename(columns={0:'Label',1:'Rfam ID'},inplace=True)
ribozyme_rfam_map = ribozyme_rfam_map[['Rfam ID', 'Label', 'sequence(s)']]
ribozyme_rfam_map

In [None]:
ribozyme_rfam_map.drop_duplicates().to_csv(properties_location + 'ribozyme.csv', index=None)

***
# Biological role

In [None]:
name = ['Tumor-Suppressor-Gene', 'Oncogene', 'General']
definition = ['A tumor suppressor gene encodes a protein that acts to regulate cell division, keeping it in check. When a tumor suppressor gene is inactivated by a mutation, the protein it encodes is not produced or does not function properly, and as a result, uncontrolled cell division may occur. Such mutations may contribute to the development of a cancer.',
              'An oncogene is a mutated gene that has the potential to cause cancer. Before an oncogene becomes mutated, it is called a proto-oncogene, and it plays a role in regulating normal cell division. Cancer can arise when a proto-oncogene is mutated, changing it into an oncogene and causing the cell to divide and multiply uncontrollably. Some oncogenes work like an accelerator pedal in a car, pushing a cell to divide again and again. Others work like a faulty brake in a car parked on a hill, also causing the cell to divide unchecked.',
              '']
narration = ['Tumor Suppressor Gene. Tumor suppressor genes are present in all cells in our body. When they are switched on, they prevent ourselves from growing and dividing. You can think of them as being like the brakes of a car. However, when a tumor suppressor gene is switched off, either because the cell mistakenly deletes it or mutates it, the brake is released and the cell may start to grow and divide uncontrollably and potentially drive the cell to turn into a cancer cell.',
             'Oncogene. The name of oncogene suggests it is a gene that can cause cancer. Initially, oncogenes were identified in viruses, which could cause cancers in animals. Later, it was found that oncogenes can be mutated copies of certain normal cellular genes also called proto-oncogenes. Intact proto-oncogenes play important functions, regulating normal cellular growth, division, and apoptosis, which is the name for programmed or controlled cell death. Oncogenes or mutated copies of the proto-oncogenes may lead to uncontrolled cell growth and the escape from cell death, which may result in cancer development.',
             '']

In [None]:
role = pd.DataFrame({'Name': name, 'Definition': definition, 'Narration': narration})
role

In [None]:
role.drop_duplicates().to_csv(properties_location + 'biologicalRole.csv', index=None)

***
# piRNA

## piRBase

In [None]:
# http://bigdata.ibp.ac.cn/piRBase/browse.php --> "Download" button
piRNA = pd.read_csv(unprocessed_data_location + 'piRBase_hsa.txt', sep='\t')[['name','aliases','accession','sequence','dataset','pubmed']]
piRNA

In [None]:
piRNA['aliases'] = piRNA['aliases'].str.replace(',', '|')

piRNA.accession = 'https://www.ncbi.nlm.nih.gov/nucleotide/' + piRNA.accession.astype(str)
piRNA.accession = piRNA.accession.replace('https://www.ncbi.nlm.nih.gov/nucleotide/nan', np.nan)

# Remove number of reads per dataset
piRNA.dataset = piRNA.dataset.str.replace(r':\d+', '', regex=True)
piRNA.dataset = 'http://bigdata.ibp.ac.cn/piRBase/browseds2.php?dsid=' + piRNA.dataset.astype(str)
piRNA.dataset = piRNA.dataset.str.replace(' ', '|http://bigdata.ibp.ac.cn/piRBase/browseds2.php?dsid=')
piRNA.dataset = piRNA.dataset.replace('http://bigdata.ibp.ac.cn/piRBase/browseds2.php?dsid=nan', np.nan)

piRNA.pubmed = 'https://pubmed.ncbi.nlm.nih.gov/' + piRNA.pubmed.astype(str)
piRNA.pubmed = piRNA.pubmed.str.replace(' ', '|https://pubmed.ncbi.nlm.nih.gov/')
piRNA.pubmed = piRNA.pubmed.replace('https://pubmed.ncbi.nlm.nih.gov/nan', np.nan)

piRNA = piRNA.rename(columns={'name':'Name','aliases':'Synonym(s)','accession':'Accession','sequence':'Sequence','dataset':'Dataset','pubmed':'References (PMID)'})
piRNA

In [None]:
# Placeholder rows for the names piR-hsa-8438265 ... piR-hsa-8592949: only
# 'Name' is filled, every other property column is NaN.
# NOTE(review): the reason for this particular ID range is not visible in this
# notebook - confirm against the piRBase release being used.
piRNAfix = pd.DataFrame({
    'Name': ['piR-hsa-' + str(num) for num in range(8438265, 8592950)],
    'Synonym(s)': np.nan,
    'Accession': np.nan,
    'Sequence': np.nan,
    'Dataset': np.nan,
    'References (PMID)': np.nan,
})
piRNAfix

In [None]:
piRNA = pd.concat([piRNA, piRNAfix])
piRNA

In [None]:
piRNA.drop_duplicates().to_csv(properties_location + 'piRNA.csv', index=None)

***
# RNA drugs 

https://go.drugbank.com/releases/latest#open-data --> DrugBank Vocabulary --> Download

In [None]:
DrugBank = pd.read_csv(unprocessed_data_location + 'drugbank vocabulary.csv') 

In [None]:
def _drugbank_ids(file_name, column='DrugBank ID'):
    """Read one tab-separated edge file from ``edge_data_location`` and return its drug-ID column."""
    return pd.read_csv(edge_data_location + file_name, sep='\t')[column]


# Drug identifiers per RNA-drug class, collected from the edge files.
# The ASO-disease file names its identifier column 'DB ID'.
ASOdnonO_data = pd.concat([_drugbank_ids('ASOd-mRNA2430.txt'),
                           _drugbank_ids('ASOd-disease2606.txt', column='DB ID'),
                           _drugbank_ids('ASOd-protein11007.txt'),
                           _drugbank_ids('ASOd-protein10002.txt')])

aptamerdnonO_data = pd.concat([_drugbank_ids('aptamerd-protein2436.txt'),
                               _drugbank_ids('aptamerd-disease2606.txt')])

siRNAdnonO_data = pd.concat([_drugbank_ids('siRNAd-mRNA2430.txt'),
                             _drugbank_ids('siRNAd-disease2606.txt')])

mRNAvnonO_data = _drugbank_ids('mRNAv-disease2606.txt')

# Distinct identifiers across all classes, re-indexed from 0
RNAdrugs = pd.concat([ASOdnonO_data, aptamerdnonO_data, siRNAdnonO_data, mRNAvnonO_data]).drop_duplicates().reset_index(drop=True)
RNAdrugs.head()

In [None]:
RNAdrugs = pd.merge(pd.DataFrame(RNAdrugs), DrugBank, left_on=[0], right_on=['DrugBank ID']).drop(columns=[0])

In [None]:
RNAdrugs.drop_duplicates().to_csv(properties_location + 'RNAdrugs.csv', index=None)

***
# Gene

## PheKnowLator

In [None]:
merged_data_clean = pd.read_csv(processed_data_location + 'Merged_Human_Ensembl_Entrez_HGNC_Uniprot_Identifiers.txt', sep='\t')
merged_data_clean = merged_data_clean[(~merged_data_clean['entrez_id'].isna()) & (merged_data_clean['entrez_id'] != 'None')]

def merge_rows(df, column1):
    """Collapse `df` to one row per distinct value of `column1`.

    Within each group, every other column is reduced to a single string: its
    values are converted with ``str``, the placeholders 'None' and 'unknown'
    are skipped, repeats are removed, and the rest is joined with '|'.
    Values are joined in first-seen order, so the result is reproducible
    across runs (a ``set`` would order strings by their randomized hash).

    Args:
        df: Input DataFrame.
        column1: Name of the column to group by.

    Returns:
        A DataFrame with `column1` as a regular column and one row per group.
    """

    def _join_unique(values):
        # dict.fromkeys de-duplicates while preserving insertion order.
        kept = dict.fromkeys(str(v) for v in values if v != 'None' and v != 'unknown')
        return '|'.join(kept)

    deduplicated = df.drop_duplicates()
    df_merged = deduplicated.groupby([column1]).agg(_join_unique).reset_index()
    return df_merged.drop_duplicates()

# Collapse the identifier table to one row per Entrez id, then display it.
merged_data_clean = merge_rows(df=merged_data_clean, column1='entrez_id')
merged_data_clean

In [None]:
# Write the de-duplicated gene identifier table to the properties folder, without the row index.
gene_properties = merged_data_clean.drop_duplicates()
gene_properties.to_csv(properties_location + 'gene.csv', index=False)

***
# Reactome pathways

In [None]:
# Name/id pairs from kegg_reactome.csv, relabelled so that column 0 = name and column 1 = id.
kegg_reactome_map = pd.read_csv(unprocessed_data_location + 'kegg_reactome.csv', header=0, delimiter=',')
kegg_reactome_map = kegg_reactome_map[['Source Name', 'Source ID']]
kegg_reactome_map.columns = [0, 1]

# ReactomePathways.txt has no header row; its third field (column 2) is compared to 'Homo sapiens'.
reactome_pathways = pd.read_csv(unprocessed_data_location + 'ReactomePathways.txt', header=None, delimiter='\t')
# remove all non-human pathways
is_human = reactome_pathways[2] == 'Homo sapiens'
reactome_pathways = reactome_pathways.loc[is_human, [0, 1]]
# Swap the labels so that this table's columns line up with kegg_reactome_map when concatenated.
reactome_pathways.columns = [1, 0]

# Combine both sources, drop repeated rows and turn column 1 into a Reactome detail URL.
desc_reactome_map = pd.concat([kegg_reactome_map, reactome_pathways]).drop_duplicates()
reactome_url_prefix = 'https://reactome.org/content/detail/'
desc_reactome_map[1] = reactome_url_prefix + desc_reactome_map[1].astype(str)

desc_reactome_map

In [None]:
# Write the Reactome pathway table to the properties folder, without the row index.
reactome_output_path = properties_location + 'Reactome.csv'
desc_reactome_map.to_csv(reactome_output_path, index=False)

***
# Wikipathways

In [None]:
# Read the tab-separated file with 587 positional column names (0..586) and keep the first two.
wpw_source_path = unprocessed_data_location + 'wpw_reactome.csv'
desc_wpw_map = pd.read_csv(wpw_source_path, delimiter='\t', names=range(587))[[0, 1]]
# In column 0, remove everything from '%WikiPathways_' up to the end of the string.
wikipathways_suffix = r'%WikiPathways_.*$'
desc_wpw_map[0] = desc_wpw_map[0].str.replace(wikipathways_suffix, '', regex=True)

desc_wpw_map

In [None]:
# Write the Wikipathways table to the properties folder, without the row index.
wikipathways_output_path = properties_location + 'Wikipathways.csv'
desc_wpw_map.to_csv(wikipathways_output_path, index=False)

***
# OBO terms/classes and properties
***

***
# RO

In [None]:
# Parse the RO ontology file into a fresh rdflib graph and report its size.
ro_owl_path = ontology_data_location + 'ro_with_imports.owl'
ro_graph = Graph()
ro_graph.parse(ro_owl_path)

ro_edge_count = len(ro_graph)
print('There are {} edges in the ontology.'.format(ro_edge_count))

In [None]:
# Collect label, definition and synonyms for every class returned by
# gets_ontology_classes(ro_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(ro_graph))
master_synonyms = [t for t in ro_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(ro_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(ro_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the RO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
ro_properties = pd.DataFrame(relation_metadata_dict).T
ro_properties.drop_duplicates().to_csv(properties_location + 'RO.csv')

***
# HPO

In [None]:
# Parse the HPO ontology file into a fresh rdflib graph and report its size.
hpo_owl_path = ontology_data_location + 'hp_with_imports.owl'
hpo_graph = Graph()
hpo_graph.parse(hpo_owl_path)

hpo_edge_count = len(hpo_graph)
print('There are {} edges in the ontology.'.format(hpo_edge_count))

In [None]:
# Collect label, definition, synonyms and database cross-references for every class
# returned by gets_ontology_classes(hpo_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(hpo_graph))
master_synonyms = [t for t in hpo_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(hpo_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(hpo_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(hpo_graph, class_uri, dbxref_uri)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the HPO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
hpo_properties = pd.DataFrame(relation_metadata_dict).T
hpo_properties.drop_duplicates().to_csv(properties_location + 'HPO.csv')

***
# GO

In [None]:
# Parse the GO ontology file into a fresh rdflib graph and report its size.
go_owl_path = ontology_data_location + 'go_with_imports.owl'
go_graph = Graph()
go_graph.parse(go_owl_path)

go_edge_count = len(go_graph)
print('There are {} edges in the ontology.'.format(go_edge_count))

In [None]:
# Collect label, definition, synonyms and OBO namespace for every class returned by
# gets_ontology_classes(go_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
hasOBONamespace = URIRef("http://www.geneontology.org/formats/oboInOwl#hasOBONamespace")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(go_graph))
master_synonyms = [t for t in go_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(go_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(go_graph, class_uri, obo.IAO_0000115)),
        # Join the individual synonyms with '|' (previously the Python list repr was written out).
        # The column is named 'Synonym' here, unlike 'Synonym(s)' in the other ontology tables;
        # it is left unchanged so existing consumers of GO.csv keep working.
        'Synonym': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'Vocabulary(MF/BP/CC)': _first_or_none(_english_objects(go_graph, class_uri, hasOBONamespace)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the GO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
go_properties = pd.DataFrame(relation_metadata_dict).T
go_properties.drop_duplicates().to_csv(properties_location + 'GO.csv')

***
# Mondo

In [None]:
# Parse the Mondo ontology file into a fresh rdflib graph and report its size.
mondo_owl_path = ontology_data_location + 'mondo_with_imports.owl'
mondo_graph = Graph()
mondo_graph.parse(mondo_owl_path)

mondo_edge_count = len(mondo_graph)
print('There are {} edges in the ontology.'.format(mondo_edge_count))

In [None]:
# Collect label, definition, synonyms and database cross-references for every class
# returned by gets_ontology_classes(mondo_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(mondo_graph))
master_synonyms = [t for t in mondo_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(mondo_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(mondo_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(mondo_graph, class_uri, dbxref_uri)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the Mondo properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
mondo_properties = pd.DataFrame(relation_metadata_dict).T
mondo_properties.drop_duplicates().to_csv(properties_location + 'Mondo.csv')

***
# VO

In [None]:
# Parse the VO ontology file into a fresh rdflib graph and report its size.
vo_owl_path = ontology_data_location + 'vo_with_imports.owl'
vo_graph = Graph()
vo_graph.parse(vo_owl_path)

vo_edge_count = len(vo_graph)
print('There are {} edges in the ontology.'.format(vo_edge_count))

In [None]:
# Collect labels, definitions, synonyms and the VO-specific annotations for every class
# returned by gets_ontology_classes(vo_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(vo_graph))
master_synonyms = [t for t in vo_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(vo_graph, class_uri, RDFS.label)),
        # alternative label
        'AlternativeLabel': _first_or_none(_english_objects(vo_graph, class_uri, obo.IAO_0000118)),
        'seeAlso': _first_or_none(_english_objects(vo_graph, class_uri, RDFS.seeAlso)),
        # trade name
        'TradeName': _first_or_none(_english_objects(vo_graph, class_uri, obo.VO_0003099)),
        'Description': _first_or_none(_english_objects(vo_graph, class_uri, obo.IAO_0000115)),
        # definition source
        'DefinitionSource': _first_or_none(_english_objects(vo_graph, class_uri, obo.IAO_0000119)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        # editor notes: all notes are joined with '|' (previously only the first one was kept,
        # although it was passed through a join).
        'EditorNotes': _joined_or_none(_english_objects(vo_graph, class_uri, obo.IAO_0000116)),
        # vaccine proper name
        'VaccineProperName': _first_or_none(_english_objects(vo_graph, class_uri, obo.VO_0003158)),
        # FDA indications
        'FDAindications': _first_or_none(_english_objects(vo_graph, class_uri, obo.VO_0003160)),
        # example of usage
        'ExampleOfUsage': _first_or_none(_english_objects(vo_graph, class_uri, obo.IAO_0000112)),
        # vaccine STN
        'vaccineSTN': _first_or_none(_english_objects(vo_graph, class_uri, obo.VO_0003162)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the VO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
vo_properties = pd.DataFrame(relation_metadata_dict).T
vo_properties.drop_duplicates().to_csv(properties_location + 'VO.csv')

***
# ChEBI

In [None]:
# Parse the ChEBI ontology file into a fresh rdflib graph and report its size.
chebi_owl_path = ontology_data_location + 'chebi_with_imports.owl'
chebi_graph = Graph()
chebi_graph.parse(chebi_owl_path)

chebi_edge_count = len(chebi_graph)
print('There are {} edges in the ontology.'.format(chebi_edge_count))

In [None]:
# Collect labels, definitions, synonyms, cross-references and the ChEBI chemical annotations
# for every class returned by gets_ontology_classes(chebi_graph), keyed by the class URI string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
hasOBONamespace = URIRef("http://www.geneontology.org/formats/oboInOwl#hasOBONamespace")
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")
iupacName = URIRef("http://purl.obolibrary.org/obo/chebi#IUPAC_NAME")
charge = URIRef("http://purl.obolibrary.org/obo/chebi/charge")
mass = URIRef("http://purl.obolibrary.org/obo/chebi/mass")
smiles = URIRef("http://purl.obolibrary.org/obo/chebi/smiles")
formula = URIRef("http://purl.obolibrary.org/obo/chebi/formula")
monoisotopicmass = URIRef("http://purl.obolibrary.org/obo/chebi/monoisotopicmass")
inchi = URIRef("http://purl.obolibrary.org/obo/chebi/inchi")
inchikey = URIRef("http://purl.obolibrary.org/obo/chebi/inchikey")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(chebi_graph))
master_synonyms = [t for t in chebi_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(chebi_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(chebi_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(chebi_graph, class_uri, dbxref_uri)),
        'Namespace': _first_or_none(_english_objects(chebi_graph, class_uri, hasOBONamespace)),
        'IUPACname': _first_or_none(_english_objects(chebi_graph, class_uri, iupacName)),
        'Charge': _first_or_none(_english_objects(chebi_graph, class_uri, charge)),
        'Mass': _first_or_none(_english_objects(chebi_graph, class_uri, mass)),
        'Smiles': _first_or_none(_english_objects(chebi_graph, class_uri, smiles)),
        'Monoisotopicmass': _first_or_none(_english_objects(chebi_graph, class_uri, monoisotopicmass)),
        'Inchi': _first_or_none(_english_objects(chebi_graph, class_uri, inchi)),
        'Inchikey': _first_or_none(_english_objects(chebi_graph, class_uri, inchikey)),
        # The formula was previously looked up but never stored; it is appended as the last
        # column so the position of the existing columns does not change.
        'Formula': _first_or_none(_english_objects(chebi_graph, class_uri, formula)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the ChEBI properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
chebi_properties = pd.DataFrame(relation_metadata_dict).T
chebi_properties.drop_duplicates().to_csv(properties_location + 'ChEBI.csv')

***
# Uberon

In [None]:
# Parse the Uberon ontology file ('ext_with_imports.owl') into a fresh rdflib graph and report its size.
uberon_owl_path = ontology_data_location + 'ext_with_imports.owl'
uberon_graph = Graph()
uberon_graph.parse(uberon_owl_path)

uberon_edge_count = len(uberon_graph)
print('There are {} edges in the ontology.'.format(uberon_edge_count))

In [None]:
# Collect label, definition, synonyms, cross-references and external definition for every
# class returned by gets_ontology_classes(uberon_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(uberon_graph))
master_synonyms = [t for t in uberon_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(uberon_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(uberon_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(uberon_graph, class_uri, dbxref_uri)),
        # external definition
        'ExternalDefinition': _first_or_none(_english_objects(uberon_graph, class_uri, obo.UBPROP_0000001)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the Uberon properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
uberon_properties = pd.DataFrame(relation_metadata_dict).T
uberon_properties.drop_duplicates().to_csv(properties_location + 'Uberon.csv')

***
# CLO

In [None]:
# Parse the CLO ontology file into a fresh rdflib graph and report its size.
clo_owl_path = ontology_data_location + 'clo_with_imports.owl'
clo_graph = Graph()
clo_graph.parse(clo_owl_path)

clo_edge_count = len(clo_graph)
print('There are {} edges in the ontology.'.format(clo_edge_count))

In [None]:
# Collect labels, definitions, synonyms, cross-references and the CLO annotations for every
# class returned by gets_ontology_classes(clo_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
# Defined here so the cell does not depend on an earlier ontology cell having been run.
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")
comment = URIRef("http://www.w3.org/2000/01/rdf-schema#comment")
seeAlso = URIRef("http://www.w3.org/2000/01/rdf-schema#seeAlso")
depictedBy = URIRef("http://xmlns.com/foaf/0.1/depicted_by")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(clo_graph))
master_synonyms = [t for t in clo_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(clo_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(clo_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(clo_graph, class_uri, dbxref_uri)),
        'Comment': _joined_or_none(_english_objects(clo_graph, class_uri, comment)),
        'SeeAlso': _joined_or_none(_english_objects(clo_graph, class_uri, seeAlso)),
        # depicted by
        'DepictedBy': _joined_or_none(_english_objects(clo_graph, class_uri, depictedBy)),
        # example of usage
        'ExampleOfUsage': _first_or_none(_english_objects(clo_graph, class_uri, obo.IAO_0000112)),
        # definition source
        'DefinitionSource': _first_or_none(_english_objects(clo_graph, class_uri, obo.IAO_0000119)),
        # alternative term
        'AlternativeTerm': _joined_or_none(_english_objects(clo_graph, class_uri, obo.IAO_0000118)),
        # IEDB alternative term
        'IEDBalternativeTerm': _first_or_none(_english_objects(clo_graph, class_uri, obo.OBI_9991118)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the CLO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
clo_properties = pd.DataFrame(relation_metadata_dict).T
clo_properties.drop_duplicates().to_csv(properties_location + 'CLO.csv')

***
# PRO

In [None]:
# Parse the PRO ontology file ('pr_with_imports.owl') into a fresh rdflib graph and report its size.
pro_owl_path = ontology_data_location + 'pr_with_imports.owl'
pro_graph = Graph()
pro_graph.parse(pro_owl_path)

pro_edge_count = len(pro_graph)
print('There are {} edges in the ontology.'.format(pro_edge_count))

In [None]:
# Collect labels, definitions, synonyms, cross-references and the PRO annotations for every
# class returned by gets_ontology_classes(pro_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
# Defined here so the cell does not depend on an earlier ontology cell having been run.
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")
comment = URIRef("http://www.w3.org/2000/01/rdf-schema#comment")
# seeAlso and depictedBy are not read in this cell; they stay defined so the notebook's
# global names are unchanged.
seeAlso = URIRef("http://www.w3.org/2000/01/rdf-schema#seeAlso")
depictedBy = URIRef("http://xmlns.com/foaf/0.1/depicted_by")
orth = URIRef("http://purl.obolibrary.org/obo/pr#PRO-short-label")
syn2 = URIRef("http://purl.obolibrary.org/obo/pr#Gene-based")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(pro_graph))
master_synonyms = [t for t in pro_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(pro_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(pro_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(pro_graph, class_uri, dbxref_uri)),
        'Comment': _joined_or_none(_english_objects(pro_graph, class_uri, comment)),
        # unique short label for PRO terms for display purposes; based on orthology
        'UniqueShortLabel(orthology-based)': _first_or_none(_english_objects(pro_graph, class_uri, orth)),
        # synonyms based on current or previous gene name, ORF name, or ordered locus name
        'Synonym(s)(unusedName/ORF/locus-based)': _joined_or_none(_english_objects(pro_graph, class_uri, syn2)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the PRO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
pro_properties = pd.DataFrame(relation_metadata_dict).T
pro_properties.drop_duplicates().to_csv(properties_location + 'PRO.csv')

***
# SO

In [None]:
# Parse the SO ontology file into a fresh rdflib graph and report its size.
so_owl_path = ontology_data_location + 'so_with_imports.owl'
so_graph = Graph()
so_graph.parse(so_owl_path)

so_edge_count = len(so_graph)
print('There are {} edges in the ontology.'.format(so_edge_count))

In [None]:
# Collect label, definition, synonyms and database cross-references for every class
# returned by gets_ontology_classes(so_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
# Defined here so the cell does not depend on an earlier ontology cell having been run.
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(so_graph))
master_synonyms = [t for t in so_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(so_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(so_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
        'DbXref': _joined_or_none(_english_objects(so_graph, class_uri, dbxref_uri)),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the SO properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
so_properties = pd.DataFrame(relation_metadata_dict).T
so_properties.drop_duplicates().to_csv(properties_location + 'SO.csv')

***
# PW

In [None]:
# Parse the PW ontology file into a fresh rdflib graph and report its size.
pw_owl_path = ontology_data_location + 'pw_with_imports.owl'
pw_graph = Graph()
pw_graph.parse(pw_owl_path)

pw_edge_count = len(pw_graph)
print('There are {} edges in the ontology.'.format(pw_edge_count))

In [None]:
# Collect label, definition and synonyms for every class returned by
# gets_ontology_classes(pw_graph), keyed by the class URI as a string.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')


def _english_objects(graph, subject, predicate):
    """Return objects of (subject, predicate) whose n3() text has no '@' or contains '@en'.

    Presumably this keeps untagged and English-tagged literals -- confirm against `n3`.
    """
    kept = []
    for obj in graph.objects(subject, predicate):
        rendered = n3(obj)
        if '@' not in rendered or '@en' in rendered:
            kept.append(obj)
    return kept


def _first_or_none(values):
    """Return the first value as a string, or the placeholder 'None' if there is none."""
    return str(values[0]) if values else 'None'


def _joined_or_none(values):
    """Return every value as a string joined by '|', or the placeholder 'None' if empty."""
    return '|'.join(str(v) for v in values) if values else 'None'


# get ontology information
# NOTE: only classes are processed; object properties (gets_object_properties) are not included.
cls = list(gets_ontology_classes(pw_graph))
master_synonyms = [t for t in pw_graph if 'synonym' in str(t[1]).lower() and isinstance(t[0], URIRef)]

# Index the synonym triples by subject once, instead of rescanning all of them for every class.
synonyms_by_class = {}
for subj, _, obj in master_synonyms:
    synonyms_by_class.setdefault(subj, []).append(str(obj))

for class_uri in tqdm(cls):
    relation_metadata_dict[str(class_uri)] = {
        'Label': _first_or_none(_english_objects(pw_graph, class_uri, RDFS.label)),
        'Description': _first_or_none(_english_objects(pw_graph, class_uri, obo.IAO_0000115)),
        'Synonym(s)': _joined_or_none(synonyms_by_class.get(class_uri, [])),
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the PW properties to CSV; the row index (the dictionary keys) is written as the first column.
# NOTE(review): drop_duplicates() ignores the index, so entries whose property values are all
# identical collapse to the first one -- confirm this is intended.
pw_properties = pd.DataFrame(relation_metadata_dict).T
pw_properties.drop_duplicates().to_csv(properties_location + 'PW.csv')