In [None]:
%%capture
import sys
!{sys.executable} -m pip install -r requirements.txt
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import gffpandas.gffpandas as gffpd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

In [None]:
# Root of the local resources tree; the data directories below are derived from it
resource_data_location = '../resources/'

# Processed data, with the unprocessed inputs nested one level deeper
processed_data_location = resource_data_location + 'processed_data/'
unprocessed_data_location = processed_data_location + 'unprocessed_data/'

# Output directories for ontologies, edge lists and node property tables
ontology_data_location = resource_data_location + 'ontologies/'
edge_data_location = resource_data_location + 'edge_data/'
properties_location = resource_data_location + 'properties_data/'

# Remote locations of the processed and original build data
_build_data_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/'
processed_url = _build_data_url + 'processed_data/'
original_url = _build_data_url + 'original_data/'

# owltools location
owltools_location = '../pkt_kg/libs/owltools'

# pre-miRNA

In [None]:
from Bio import SeqIO

# Download the miRBase EMBL flat file into the processed-data folder
data_downloader('https://www.mirbase.org/download/miRNA.dat', processed_data_location)

# Path of the downloaded EMBL file
embl_file = processed_data_location + 'miRNA.dat'

# One list per output column; each parsed record appends one value to every list
data = {
    "ID": [],
    "Description": [],
    "Sequence": [],
    "Comments": [],
    "References": [],
    "Feature Table": []
}

# Iterate through the records in the EMBL file
for record in SeqIO.parse(embl_file, "embl"):
    data["ID"].append(record.id)
    data["Description"].append(record.description)
    data["Sequence"].append(str(record.seq))
    data["Comments"].append(str(record.annotations.get('comment', '')))
    # Number the references from 1 and render each as "[n] <pubmed id>".
    # The previous f-string formatted a tuple, which produced text like "([1], '12345')".
    references = [
        f"[{i}] {ref.pubmed_id}"
        for i, ref in enumerate(record.annotations.get('references', []), start=1)
    ]
    data["References"].append(", ".join(references))
    # Features are stored as their text representation, one feature after another
    data["Feature Table"].append("\n".join(str(feature) for feature in record.features))

df = pd.DataFrame(data)
# Keep human entries only; .copy() so the column assignment below acts on an independent frame
df = df[df['Description'].astype(str).str.contains('Homo sapiens')].copy()

# Split the feature text on the miRNA feature header and give every piece its own row
df['Feature Table'] = df['Feature Table'].str.split("type: miRNA")
df = df.explode('Feature Table')
df = df[df['Feature Table'] != '']
df

In [None]:
# Break each feature text block into its individual lines so qualifiers can be parsed line by line
df['Feature Table'] = df['Feature Table'].str.split("\n")
# Spot check: show the parsed lines for the row(s) carrying index label 57
list(df['Feature Table'].loc[57])

In [None]:
def extract_values(row):
    """Turn the text lines of one feature block into a Series of named values.

    A line containing "location: " contributes a 'location' entry holding the text
    after that marker. A line containing "Key: " contributes one entry whose name is
    the text between "Key: " and ", Value:" and whose value is the remainder with
    surrounding spaces, brackets and quotes stripped. Other lines are ignored.

    Args:
        row: iterable of text lines belonging to a single feature.

    Returns:
        pandas Series indexed by the extracted names.
    """
    parsed = {}
    for line in row:
        if "location: " in line:
            parsed['location'] = line.split("location: ")[1]
            continue
        if "Key: " not in line:
            continue
        qualifier = line.split("Key: ")[1]
        name = qualifier.split(", Value:")[0].strip()
        # strip() removes character sets, so trailing brackets inside the value are trimmed as well
        parsed[name] = qualifier.split(", Value:")[1].strip(" ['").strip("'']")
    return pd.Series(parsed)

# Parse every row's feature lines; each extracted name becomes a column of the resulting frame
new_columns = df['Feature Table'].apply(extract_values)

# Append the parsed columns next to the original ones (column-wise concatenation)
df = pd.concat([df, new_columns], axis=1)
df

In [None]:
# Precursor-level columns, with the 'mod_base' qualifier exposed as 'Modification'
premirna = (
    df.loc[:, ['ID', 'Description', 'Sequence', 'Comments', 'References', 'mod_base']]
      .rename(columns={'mod_base':'Modification'})
)
premirna

In [None]:
# Header-less, tab-separated mapping file; its columns are addressed later by position (0 and 1)
miRBaseMap = pd.read_csv(processed_data_location + 'MIRNA_MIRBASE_MAP.txt', header=None, sep='\t')
miRBaseMap

In [None]:
# Join the parsed miRBase records to the mapping on ID == mapping column 1,
# then reuse mapping column 0 as the human-readable label
premirna = pd.merge(df, miRBaseMap, left_on=['ID'], right_on=[1]).assign(
    Label=lambda merged: merged[0]
)
premirna

In [None]:
# miRNet SNP table: keep high-confidence rows and only the name/family columns
miRNA_variant = pd.read_csv(unprocessed_data_location + "miRNet-snp-mir-hsa.csv?dl=0")
miRNA_variant = miRNA_variant.loc[
    miRNA_variant['High_Confidence']=='YES', ['MIRNA_Name','Family_Name']
]
# Attach the mapping via the miRNA name (mapping column 0), then discard both name columns
miRNA_variant = (
    pd.merge(miRNA_variant, miRBaseMap, left_on=['MIRNA_Name'], right_on=[0])
      .drop(columns=['MIRNA_Name',0])
)

miRNA_variant

In [None]:
# Outer-join the family names onto the pre-miRNA table and expose them as 'Family name'
premirna = (
    pd.merge(premirna, miRNA_variant, left_on=['ID'], right_on=[1], how='outer')
      .rename(columns={'Family_Name':'Family name'})
)
# Columns exported as pre-miRNA node properties
premirna_columns = ['ID','Label','Description','Sequence','Family name','Comments','References']
premirna[premirna_columns].drop_duplicates().to_csv(properties_location + 'premiRNA.csv', index=None)

In [None]:
# Preview the de-duplicated pre-miRNA property columns
premirna[['ID','Label','Description','Sequence','Family name','Comments','References']].drop_duplicates()

***
# miRNA

In [None]:
# Mature miRNA table: the precursor 'ID' is dropped and the feature-level 'accession'
# takes its place; the remaining qualifiers are renamed to capitalised column names
mirna = df.drop(columns=['ID']).rename(columns={'accession':'ID',
                                                'location':'Location',
                                                'evidence':'Evidence',
                                                'experiment':'Experiment',
                                                'product':'Label'})
# Qualifier parsing strips trailing ']' characters, so one is appended back here
mirna['Experiment'] = mirna['Experiment'] + ']'
# The column is called 'Evidence' after the rename above; the previous `mirna.evidence`
# lookup raised AttributeError because no 'evidence' column exists any more
mirna['Evidence'] = mirna['Evidence'].replace(
    'experimental', 'http://purl.obolibrary.org/obo/NCIT_C43622 (experimental method)')
mirna = mirna[['ID','Label','References','Location','Evidence','Experiment']]
mirna

In [None]:
# Persist the de-duplicated miRNA properties
mirna.drop_duplicates().to_csv(properties_location + 'miRNA.csv', index=None)

***
# tsRNA

## tsRFun 

In [None]:
# Load the tab-separated tsRFun identifier file from the unprocessed-data folder
tsRNA = pd.read_csv(unprocessed_data_location + 'newID_20210202.txt', sep="\t")  
tsRNA

***
# tRF

## tRFdb

In [None]:
# Source: http://genome.bioch.virginia.edu/trfdb/index.php
# Each saved listing page (trf1/trf3/trf5) holds the data in its third HTML table;
# the 'Organism' column is not kept.
tRF1_tRNA = pd.read_html(unprocessed_data_location+'trf1.html')[2].drop(columns=['Organism'])
tRF3_tRNA = pd.read_html(unprocessed_data_location+'trf3.html')[2].drop(columns=['Organism'])
tRF5_tRNA = pd.read_html(unprocessed_data_location+'trf5.html')[2].drop(columns=['Organism'])

# Stack the three listings, drop two further columns and store the tRF ID as text
tRF_tRNA = (
    pd.concat([tRF1_tRNA,tRF3_tRNA,tRF5_tRNA])
      .drop(columns=['Experiment Info', 'Sequence'])
)
tRF_tRNA['tRF ID'] = tRF_tRNA['tRF ID'].astype(str)
tRF_tRNA

In [None]:
import re

def get_numbers(identifier):
    """Collect the link targets found in a saved tRFdb listing page.

    Args:
        identifier: text inserted into the file name 'trf<identifier>.html', which is
            read from `unprocessed_data_location`.

    Returns:
        dict with 'sequence_numbers' (the `seq_id` values of the sequence_display.php
        links, as ints) and 'experiment_numbers' (the text following `trf_id=` in the
        experiments_display.php links, up to the closing quote).
    """
    page_path = unprocessed_data_location + 'trf' + identifier + '.html'
    with open(page_path, 'r', encoding='utf-8') as page_file:
        page = page_file.read()

    sequence_ids = [
        int(hit) for hit in re.findall(r'href=\'sequence_display.php\?seq_id=(\d+)', page)
    ]
    experiment_ids = re.findall(r"href='experiments_display.php\?trf_id=(.*?)'", page)

    return {'sequence_numbers': sequence_ids, 'experiment_numbers': experiment_ids}

In [None]:
def transform(original_html):
    """Rewrite a sequence page's HTML so each field sits in its own <font> element.

    The substitutions are applied one after another, in the listed order, to the
    result of the previous one.

    Args:
        original_html: HTML text of a sequence page.

    Returns:
        The rewritten HTML text.
    """
    substitutions = (
        (r'<font face=', '\n<font face='),
        (r'<br><b>Organism:', "</font><br>\n<font face='Arial' size='2'><b>Organism:"),
        (r'<br><b>tRF Sequence:', "</font><br>\n<font face='Arial' size='2'><b>tRF Sequence:"),
        (r"<font face='Courier' size='3'>", "</font><br>\n<font face='Arial' size='2'>"),
        (r"<br><b>Map Position:", "\n<font face='Arial' size='2'><b>Map Position:"),
    )
    transformed_html = original_html
    for pattern, replacement in substitutions:
        transformed_html = re.sub(pattern, replacement, transformed_html)
    return transformed_html

In [None]:
import requests

def get_html(identifier):
    """Download one tRFdb sequence page.

    Args:
        identifier: sequence id, as text, appended to the `seq_id=` query parameter.

    Returns:
        The response body as text, or None when the status is neither successful nor 500.

    Raises:
        requests.RequestException: on connection problems or when the timeout expires.
    """
    url = 'http://genome.bioch.virginia.edu/trfdb/sequence_display.php?seq_id=' + identifier
    # A timeout keeps one unresponsive request from stalling the whole scraping loop
    response = requests.get(url, timeout=60)
    # NOTE(review): the original accepted only HTTP 500, presumably because the server
    # answers with that status while still sending the page — confirm. Status 500 stays
    # accepted for compatibility; ordinary successful responses are accepted as well.
    if response.ok or response.status_code == 500:
        return response.text
    return None

In [None]:
from bs4 import BeautifulSoup

def _scrape_trf_pages(trf_type):
    """Download and parse every sequence page linked from one saved listing page.

    Args:
        trf_type: listing identifier passed to `get_numbers` ('1', '3' or '5' below).

    Returns:
        List of one-row DataFrames, one per page that could be downloaded. The text
        fields of a page are stored under positional column labels, followed by an
        'Experiment Number' column holding the experiment link paired with that page.
    """
    result = get_numbers(trf_type)
    # Pair each sequence id with the experiment link found at the same position
    numbers_mapping = dict(zip(result['sequence_numbers'], result['experiment_numbers']))

    pages = []
    for sequence_id in result['sequence_numbers']:
        html_content = get_html(str(sequence_id))
        if html_content is None:
            continue  # page could not be retrieved

        # Normalise the markup, then read the text of every <font> element
        soup = BeautifulSoup(transform(html_content), 'html.parser')
        values = [font.get_text() for font in soup.find_all('font')]
        # For "label: value" fields keep the part after the first colon
        values = [value.split(":")[1].strip() if ":" in value else value for value in values]

        temp = pd.DataFrame(values).T
        temp.columns = range(temp.shape[1])
        temp['Experiment Number'] = numbers_mapping.get(sequence_id, None)
        pages.append(temp)
    return pages


# Scrape the three listings with one shared routine and concatenate once at the end,
# instead of repeating the loop three times and growing the frame page by page
trf_pages = [page for trf_type in ('1', '3', '5') for page in _scrape_trf_pages(trf_type)]
df = pd.concat(trf_pages, ignore_index=True) if trf_pages else pd.DataFrame()

In [None]:
def extract_chr_substring(text):
    """Return the part of `text` from the first 'chr' up to (not including) the next '&'.

    An empty string is returned when 'chr' does not occur or no '&' follows it.
    """
    start = text.find('chr')
    if start == -1:
        return ''
    stop = text.find('&', start)
    return text[start:stop] if stop != -1 else ''

# Reduce each experiment link to its 'chr...' portion (text up to the next '&')
df['Experiment Number'] = df['Experiment Number'].apply(extract_chr_substring)
# Positional rename — assumes every scraped page yielded exactly five text fields plus
# the experiment column; TODO confirm against the scraped frame
df.columns = ['tRF ID','organism','empty','Sequence','Map Position','tRNA Gene Co-ordinates']
df = df.drop(columns=['organism','empty'])
df

In [None]:
# Combine the listing table with the scraped page details on tRF ID and tRNA gene coordinates
tRF = pd.merge(tRF_tRNA,df,on=['tRF ID', 'tRNA Gene Co-ordinates'])
tRF

In [None]:
# Persist the de-duplicated tRFdb tRF properties
tRF.drop_duplicates().to_csv(properties_location + 'tRF_tRFdb.csv', index=None)

## MINTBASE

In [None]:
# Header-less, tab-separated mapping; its two columns are named on load
tRNA_MINTbase_GtRNAdb_map = pd.read_csv(
    processed_data_location + 'tRNA_MINTbase_GtRNAdb_MAP.txt', header=None, sep='\t'
).rename(columns={0:'MINTbase tRNA name',1:'gtRNAdb name'})
tRNA_MINTbase_GtRNAdb_map

In [None]:
# Source: https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
alt_id_column = 'MINTbase Alternative IDs (GRCh37 assembly-derived)'

tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbasetRF-tRNA.txt',sep='\t')
# Keep the text before the first '@' and use it as the MINTbase tRNA name
tRF_tRNA2[alt_id_column] = tRF_tRNA2[alt_id_column].str.split('@').str[0]
tRF_tRNA2 = tRF_tRNA2.rename(columns={alt_id_column:'MINTbase tRNA name'})
# Attach the GtRNAdb names and keep the fragment-level columns
tRF_tRNA2 = pd.merge(tRF_tRNA2, tRNA_MINTbase_GtRNAdb_map, on='MINTbase tRNA name')
tRF_tRNA2 = tRF_tRNA2[['Type','License Plate (sequence derived)','Fragment sequence','gtRNAdb name','MINTbase tRNA name']]
tRF_tRNA2

In [None]:
# Persist the de-duplicated MINTbase tRF properties
tRF_tRNA2.drop_duplicates().to_csv(properties_location + 'tRF_MINTBASE.csv', index=None)

***
# tRNA

## GtRNAdb

In [None]:
# Download the GtRNAdb hg38 tRNA FASTA into the unprocessed-data folder. The parsing cell
# reads it from `unprocessed_data_location`, whereas the previous `wget` call saved the
# file into the current working directory.
data_downloader('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/hg38-tRNAs.fa', unprocessed_data_location)

In [None]:
from Bio.SeqIO.FastaIO import SimpleFastaParser

# Local copy of the GtRNAdb FASTA file
fasta_file_path = unprocessed_data_location + 'hg38-tRNAs.fa'

# Read (identifier, sequence) pairs; the identifier is the first word of the FASTA title
with open(fasta_file_path) as fasta_file:
    fasta_records = [
        (title.split(None, 1)[0], sequence)
        for title, sequence in SimpleFastaParser(fasta_file)
    ]

identifiers = [fasta_record[0] for fasta_record in fasta_records]
seq = [fasta_record[1] for fasta_record in fasta_records]

data = {"Identifier": identifiers, "Sequence": seq}
df = pd.DataFrame(data)
df

In [None]:
# Sanity check: True when every identifier starts with the 'Homo_sapiens_' prefix
all(df['Identifier'].str.startswith('Homo_sapiens_'))

In [None]:
# Remove the leading 'Homo_sapiens_' characters (slices by prefix length, so it relies on the check above)
df['Identifier'] = df['Identifier'].str[len('Homo_sapiens_'):]
df

In [None]:
# Optional local cache of a previously scraped tRNA table
pkl = "tRNA.pkl"
#tRNA.to_pickle(pkl)
# The cache is only created by the commented-out line above, so it is missing on a fresh
# checkout; load it only when present instead of failing with FileNotFoundError
if os.path.exists(pkl):
    tRNA = pd.read_pickle(pkl)

In [None]:
# Fetch the gene page once and reuse the parsed tables (previously the same URL was
# downloaded twice, once per table)
gene_tables = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/tRNA-Ala-AGC-1-1.html')
# Transpose the first two tables and place them side by side
tRNA = pd.concat([gene_tables[0].T, gene_tables[1].T], axis=1)
# The first row of the combined frame becomes the header; the data starts after it
tRNA.columns = tRNA.iloc[0]
tRNA = tRNA[1:]
tRNA

In [None]:
# Scrape the remaining gene pages. The first identifier is skipped here.
# NOTE(review): that presumably relies on the first identifier being the gene already
# loaded into `tRNA` (tRNA-Ala-AGC-1-1) — confirm.
gene_frames = []
for identifier in df['Identifier'][1:]:
    # One download per gene page; both tables come from the same parsed result
    gene_tables = pd.read_html('http://gtrnadb.ucsc.edu/genomes/eukaryota/Hsapi38/genes/' + identifier + '.html')
    temp = pd.concat([gene_tables[0].T, gene_tables[1].T], axis=1)
    temp.columns = temp.iloc[0]
    gene_frames.append(temp[1:])

# Concatenate once rather than re-copying the growing frame on every iteration
tRNA = pd.concat([tRNA, *gene_frames])

# Drop the link text that the page appends to the locus
tRNA['Locus'] = tRNA['Locus'].str.replace(' View in Genome Browser', '', regex=False)
tRNA = tRNA.drop(columns=['Organism', 'Known Modifications (Modomics)'])
tRNA

In [None]:
# Persist the de-duplicated GtRNAdb tRNA properties
tRNA.drop_duplicates().to_csv(properties_location + 'tRNA_GtRNAdb.csv', index=None)

## tRNAdb

In [None]:
# Source: http://trna.bioinf.uni-leipzig.de/DataOutput/Result
# The saved result page keeps the data in its fourth HTML table
tRNA_aa = pd.read_html(unprocessed_data_location+'tRNAdb - Transfer RNA database.html')[3]
tRNA_aa = tRNA_aa.drop(columns=[0,1,2,4,19,20])
# Use the first row as column names, then keep the rows from the third one onwards
tRNA_aa = tRNA_aa.rename(columns=tRNA_aa.iloc[0])
tRNA_aa = tRNA_aa.iloc[2:]
tRNA_aa.head()

In [None]:
tRNA_aa.drop_duplicates().to_csv(properties_location + 'tRNA_tRNAdb.csv', index=None)

## tRFdb

In [None]:
# Reuses the `tRF_tRNA` frame built in the tRF section; the commented-out lines show how
# the per-type tables could be re-read from the saved pages instead.
#tRF1_tRNA = pd.read_html(unprocessed_data_location+'trf1.html')[2]
#tRF3_tRNA = pd.read_html(unprocessed_data_location+'trf3.html')[2]
#tRF5_tRNA = pd.read_html(unprocessed_data_location+'trf5.html')[2]
# Keep only the tRNA-level columns
tRF_tRNA = tRF_tRNA[['tRNA Gene Co-ordinates','tRNA Name']]
#tRF_tRNA = pd.concat([tRF1_tRNA,tRF3_tRNA,tRF5_tRNA])
tRF_tRNA.head()

In [None]:
# Persist the de-duplicated tRFdb tRNA properties
tRF_tRNA.drop_duplicates().to_csv(properties_location + 'tRNA_tRFdb.csv', index=None)

## MINTBASE

In [None]:
# Source: https://cm.jefferson.edu/MINTbase/InputController?g=GRCh37&d=y&v=g&e=1.0&cl=,4,5,11,12,16,18,19,21,22,26,27,#ttop
alt_id_column = 'MINTbase Alternative IDs (GRCh37 assembly-derived)'

tRF_tRNA2 = pd.read_csv(unprocessed_data_location+'MINTbasetRF-tRNA.txt',sep='\t')
# Keep the text before the first '@' and use it as the MINTbase tRNA name
tRF_tRNA2[alt_id_column] = tRF_tRNA2[alt_id_column].str.split('@').str[0]
tRF_tRNA2 = tRF_tRNA2.rename(columns={alt_id_column:'MINTbase tRNA name'})
# Attach the GtRNAdb names; the MINTbase name is only needed as the join key
tRF_tRNA2 = (
    pd.merge(tRF_tRNA2, tRNA_MINTbase_GtRNAdb_map, on='MINTbase tRNA name')
      .drop(columns=['MINTbase tRNA name'])
)
# tRNA-level columns exported for this source
tRF_tRNA2 = tRF_tRNA2[[
    'tRNA number','Amino acid and anticodon','Chromosome','Chromosome strand',
    'Chromosome start position','Chromosome end position',
    'Start position relative to start of mature tRNA','End position relative to start of mature tRNA',
    '# Distinct anticodons','# Instances in true MT tRNAs','# Instances in tRNA lookalikes in nucleus',
    'D-loop overlap?','Anticodon-loop overlap?','Anticodon-triplet overlap?','T-loop overlap?',
    'Exclusively within tRNA genes?','gtRNAdb name'
]]
tRF_tRNA2

In [None]:
# Persist the de-duplicated MINTbase tRNA properties
tRF_tRNA2.drop_duplicates().to_csv(properties_location + 'tRNA_MINTBASE.csv', index=None)

***
# Small protein

In [None]:
# LncBook small-protein table: keep rows whose Symbol is not the '-' placeholder and
# drop the gene/transcript identifier and evidence columns
lncRNA_protein = pd.read_csv(unprocessed_data_location + 'sprotein_LncBook2.0.csv.gz')
lncRNA_protein = (
    lncRNA_protein[lncRNA_protein['Symbol']!='-']
      .drop(columns=['Gene ID','Symbol','Transcript ID','Experimental Evidence'])
)
lncRNA_protein

In [None]:
# Persist the de-duplicated small-protein properties
lncRNA_protein.drop_duplicates().to_csv(properties_location + 'smallProtein.csv', index=None)

***
# Riboswitch

## TBDB

In [None]:
# Columns of the TBDB export kept as riboswitch properties
tbdb_columns = [
    'unique_name', 'Name', 'Sequence', 'Tbox_start' , 'Tbox_end', 'Structure', 's1_start', 's1_loop_start',
    's1_loop_end', 's1_end', 'antiterm_start', 'antiterm_end', 'term_start', 'term_end', 'codon_start',
    'codon_end', 'codon', 'codon_region', 'discrim_start', 'discrim_end', 'discriminator', 'warnings',
    'type', 'source', 'whole_antiterm_structure', 'other_stems', 'whole_antiterm_warnings', 'term_sequence',
    'term_structure', 'terminator_energy', 'term_errors', 'antiterm_term_sequence',
    'infernal_antiterminator_structure', 'vienna_antiterminator_structure', 'vienna_antiterminator_energy',
    'vienna_antiterminator_errors', 'terminator_structure', 'terminator_errors', 'new_term_structure',
    'new_term_energy', 'new_term_errors', 'whole_term_structure', 'folded_antiterm_structure', 'Trimmed_sequence',
    'Trimmed_antiterm_struct', 'Trimmed_term_struct', 'accession_url', 'accession_name', 'locus_start', 
    'locus_end', 'locus_view_start', 'locus_view_end', 'deltadelta_g', 'TaxId'
]
# Load the comma-separated TBDB file and restrict it to those columns
riboswitch_protein = pd.read_csv(unprocessed_data_location+'tbdb.csv', sep=',')[tbdb_columns]
riboswitch_protein

In [None]:
# Persist the de-duplicated TBDB riboswitch properties
riboswitch_protein.drop_duplicates().to_csv(properties_location + 'riboswitch_TBDB.csv', index=None)

## RSwitch

In [None]:
# Header-less RSwitch export; its columns are addressed by position afterwards
riboswitch_bactStrain = pd.read_csv(unprocessed_data_location + 'rswitch.csv', header=None) 
riboswitch_bactStrain

In [None]:
# Persist the first two columns of the RSwitch table, de-duplicated
riboswitch_bactStrain[[0,1]].drop_duplicates().to_csv(properties_location + 'riboswitch_RSwitch.csv', index=None)

***
# Viral RNA

## ViroidDB

In [None]:
# Load the ViroidDB JSON dump and transpose it (the keys of the JSON become rows)
vRNA_ribozyme = pd.read_json(unprocessed_data_location + 'all.json').T 

# Extract ribozymes 
# Each hit is a line starting with ">> " inside the free-text 'ribozymes' field
myre = re.compile(r"\n>> .*?\n")
ribozyme = [myre.findall(i) for i in vRNA_ribozyme.ribozymes]
ribozyme = [[j.replace("\n",'').replace(">> ",'') for j in i] for i in ribozyme]

# List of all possible ribozymes (useful for mapping)
a = [i for j in ribozyme for i in j]
set(a)

# Append the per-record hit lists as a new column (labelled 0), one row per hit after explode,
# keeping only the first whitespace-separated token of each hit
vRNA_ribozyme = pd.concat([vRNA_ribozyme.reset_index().drop(columns=['index']),
                           pd.Series(ribozyme)], axis=1)
vRNA_ribozyme = vRNA_ribozyme.explode(0)
vRNA_ribozyme[0] = vRNA_ribozyme[0].str.split().str[0]
# NOTE(review): the selection below does not include column 0, so the extracted ribozyme
# token is not carried into the exported table — confirm this is intended
vRNA_ribozyme=vRNA_ribozyme[['identicalSeqs', 'accession', 'submitters', 'releaseDate', 'isolate', 'species',
                            'genus', 'family', 'moleculeType', 'sequenceType', 'nucCompleteness', 'genotype', 'segment',
                            'publications', 'geoLocation', 'host', 'isolationSource', 'collectionDate', 'bioSample',
                            'genBankTitle', 'displayTitle', 'sequence', 'structure', 'type', 'ribozymes',
                            'Cls_ID80', 'Cls_ID70', 'Cls_ID85', 'Cls_ID75', 'Cls_ID95', 'Cls_ID90']]

# Stringify two columns before the later drop_duplicates — presumably because they hold
# unhashable values; TODO confirm
vRNA_ribozyme['identicalSeqs'] = vRNA_ribozyme['identicalSeqs'].astype(str)
vRNA_ribozyme['structure'] = vRNA_ribozyme['structure'].astype(str)
# Re-insert 'accession' at column position 1
vRNA_ribozyme.insert(1,'accession',vRNA_ribozyme.pop('accession'))
vRNA_ribozyme

In [None]:
# Persist the de-duplicated viral RNA properties
vRNA_ribozyme.drop_duplicates().to_csv(properties_location + 'viralRNA.csv', index=None)

***
# Aptamer

## Apta-Index

In [None]:
# Apta-Index export: the file's own first line is skipped and fixed column names are used.
# Target is lower-cased and then removed; ID is turned into an 'aptamer-details/?id=' path.
aptamer_protein = (
    pd.read_csv(unprocessed_data_location + 'aptaindex.csv',names=['Name', 'ID', 'Target', 'Sequence'],skiprows=[0])
      .assign(Target=lambda table: table['Target'].str.lower(),
              ID=lambda table: 'aptamer-details/?id=' + table['ID'].astype(str))
      .drop(columns=['Target'])
)
aptamer_protein

In [None]:
# Persist the de-duplicated aptamer properties
aptamer_protein.drop_duplicates().to_csv(properties_location + 'aptamer.csv', index=None)

***
# Ribozyme

## Rfam

In [None]:
# Ribozyme name -> Rfam accession path ('family/...' or 'clan/...').
# 'VS ribozyme' is left out because it is absent in RFAM.
_rfam_accession_by_name = {
    'LC ribozyme': 'family/RF00011',
    'hammerhead ribozyme': 'clan/CL00010',
    'glmS ribozyme': 'family/RF00234',
    'HDV-F-prausnitzii': 'family/RF02682',
    'HDV ribozyme': 'family/RF00094',
    'HDV_ribozyme': 'family/RF00094',
    'Hairpin': 'family/RF00173',
    'Hammerhead_1': 'clan/CL00010',
    'Hammerhead_HH9': 'clan/CL00010',
    'Hammerhead_3': 'clan/CL00010',
    'Hammerhead_HH10': 'clan/CL00010',
    'Hammerhead_II': 'clan/CL00010',
    'Pistol': 'family/RF02679',
    'Pistol ribozyme': 'family/RF02679',
    'twister ribozyme': 'clan/CL00120',
    'Twister-P5': 'clan/CL00120',
    'Twister-P3': 'clan/CL00120',
    'RNAse P': 'family/RF00009',
}

# Two positional columns: 0 = ribozyme name, 1 = Rfam accession path
ribozyme_rfam_map = pd.DataFrame(data=[[name, accession] for name, accession in _rfam_accession_by_name.items()])

ribozyme_rfam_map

In [None]:
import requests
from Bio import SeqIO
from io import StringIO

# Only 'family/...' accessions are queried for an alignment; clan entries are skipped
ribozyme_family = ribozyme_rfam_map[ribozyme_rfam_map[1].str.contains('family')]
ribozyme_sequences = {}

# Several ribozyme names share one accession, so iterate over the distinct accessions to
# avoid downloading the same alignment more than once (the resulting dict is unchanged)
for ribozyme in ribozyme_family[1].unique():
    url = 'http://rfamlive.xfam.org/' + ribozyme + '/alignment?acc=' + ribozyme.rsplit('/')[1] + '&format=fasta&download=1'
    # A timeout keeps an unresponsive server from blocking the notebook indefinitely
    response = requests.get(url, timeout=120)
    # Parse the FASTA text from memory; a non-FASTA body simply yields no records
    sequences = list(SeqIO.parse(StringIO(response.text), 'fasta'))
    ribozyme_sequences[ribozyme] = sequences

In [None]:
# One record per accession, holding the list of its sequences as plain strings
data = [
    {'ribozyme': ribozyme, 'sequence(s)': [str(seq_record.seq) for seq_record in seq_records]}
    for ribozyme, seq_records in ribozyme_sequences.items()
]

# Build the frame from the list of records
df = pd.DataFrame(data)
df

In [None]:
def _join_sequences(sequences):
    """Join a list of sequences with '; '; a float (missing value after the merge) becomes ''."""
    if isinstance(sequences, float):
        return ''
    return '; '.join(map(str, sequences))


# Attach the downloaded sequences to every name sharing the accession; accessions without
# sequences keep their row because of the outer join
ribozyme_rfam_map = (
    pd.merge(ribozyme_rfam_map,df,left_on=[1],right_on=['ribozyme'], how='outer')
      .drop(columns=['ribozyme'])
)
ribozyme_rfam_map['sequence(s)'] = ribozyme_rfam_map['sequence(s)'].apply(_join_sequences)
ribozyme_rfam_map

In [None]:
# Persist the de-duplicated ribozyme properties
ribozyme_rfam_map.drop_duplicates().to_csv(properties_location + 'ribozyme.csv', index=None)

***
# Biological role

## dbEssLnc

In [None]:
# Three parallel lists (same order): role name, its definition and a longer narration.
# The 'General' role has an empty definition and narration.
name = ['Tumor-Suppressor-Gene', 'Oncogene', 'General']
definition = ['A tumor suppressor gene encodes a protein that acts to regulate cell division, keeping it in check. When a tumor suppressor gene is inactivated by a mutation, the protein it encodes is not produced or does not function properly, and as a result, uncontrolled cell division may occur. Such mutations may contribute to the development of a cancer.',
              'An oncogene is a mutated gene that has the potential to cause cancer. Before an oncogene becomes mutated, it is called a proto-oncogene, and it plays a role in regulating normal cell division. Cancer can arise when a proto-oncogene is mutated, changing it into an oncogene and causing the cell to divide and multiply uncontrollably. Some oncogenes work like an accelerator pedal in a car, pushing a cell to divide again and again. Others work like a faulty brake in a car parked on a hill, also causing the cell to divide unchecked.',
              '']
# NOTE(review): "prevent ourselves from growing" in the first narration looks like a
# transcription slip for "our cells" — confirm against the source text before changing the data
narration = ['Tumor Suppressor Gene. Tumor suppressor genes are present in all cells in our body. When they are switched on, they prevent ourselves from growing and dividing. You can think of them as being like the brakes of a car. However, when a tumor suppressor gene is switched off, either because the cell mistakenly deletes it or mutates it, the brake is released and the cell may start to grow and divide uncontrollably and potentially drive the cell to turn into a cancer cell.',
             'Oncogene. The name of oncogene suggests it is a gene that can cause cancer. Initially, oncogenes were identified in viruses, which could cause cancers in animals. Later, it was found that oncogenes can be mutated copies of certain normal cellular genes also called proto-oncogenes. Intact proto-oncogenes play important functions, regulating normal cellular growth, division, and apoptosis, which is the name for programmed or controlled cell death. Oncogenes or mutated copies of the proto-oncogenes may lead to uncontrolled cell growth and the escape from cell death, which may result in cancer development.',
             '']

In [None]:
# Assemble the three parallel lists into one table, one row per role
role = pd.DataFrame({'Name': name, 'Definition': definition, 'Narration': narration})
role

In [None]:
# Persist the de-duplicated biological role properties
role.drop_duplicates().to_csv(properties_location + 'biologicalRole.csv', index=None)

***
# piRNA

## RNAInter

In [None]:
# Distinct values of the first column of the two piRNA edge files (header-less, tab-separated)
piRNA = pd.concat([pd.read_csv('../resources/edge_data/piRNA-mRNA.txt',sep='\t',header=None)[0],
    pd.read_csv('../resources/edge_data/piRNA-lncRNA.txt',sep='\t',header=None)[0]]).drop_duplicates()
piRNA

In [None]:
# First table of the piRBase entry page: take its first two columns and transpose them
# so the first row can serve as the header
piRNA = pd.read_html('http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-39980')[0][[0,1]].T
piRNA.columns = piRNA.iloc[0]
piRNA = piRNA[1:]
# Identifier assigned to this entry in the output table
piRNA['piRBase Id'] = 'piR-39980'
piRNA.reset_index(drop=True, inplace=True)
piRNA

In [None]:
# Inspect the second table of the same piRBase entry page
pd.read_html('http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-39980')[1]

In [None]:
# Inspect the 'No.' and 'Location' columns of the third table of the same page
pd.read_html('http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-39980')[2][['No.', 'Location']]

In [None]:
# Download the entry page once and reuse its parsed tables (previously fetched twice here)
pirbase_tables = pd.read_html('http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-39980')

# Place the second table and the 'No.'/'Location' columns of the third table next to the entry row
piRNA = pd.concat([piRNA,
           pirbase_tables[1],
           pirbase_tables[2][['No.', 'Location']]
          ], axis=1)

piRNA.reset_index(drop=True, inplace=True)
piRNA

GenBank accession DQ590027 (https://www.ncbi.nlm.nih.gov/nucleotide/DQ590027) corresponds to the piRBase entry piR-hsa-57139 (http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-57139).

In [None]:
# Keep the table built for the first entry before `piRNA` is reused for the second one
piRNA1 = piRNA.copy()

# Download the second entry page once and reuse its parsed tables (previously fetched three times)
pirbase_tables = pd.read_html('http://bigdata.ibp.ac.cn/piRBase/pirna.php?name=piR-hsa-57139')

# First table: first two columns, transposed so the first row serves as the header
piRNA = pirbase_tables[0][[0,1]].T
piRNA.columns = piRNA.iloc[0]
# .copy() so the column assignment below acts on an independent frame rather than a slice
piRNA = piRNA[1:].copy()
piRNA['piRBase Id'] = 'piR-DQ590027'
piRNA.reset_index(drop=True, inplace=True)

# Second table and the 'No.'/'Location' columns of the third table go next to the entry row
piRNA = pd.concat([piRNA,
           pirbase_tables[1],
           pirbase_tables[2][['No.', 'Location']]
          ], axis=1)

piRNA.reset_index(drop=True, inplace=True)

# Stack both entries and keep the rows labelled 0 of each (both frames were re-indexed from 0)
piRNA = pd.concat([piRNA1,piRNA]).loc[0].drop(columns=['Length', 'Organism'])
piRNA

In [None]:
# Persist the de-duplicated piRNA properties
piRNA.drop_duplicates().to_csv(properties_location + 'piRNA.csv', index=None)

***
# RNA drugs

## DrugBank

https://go.drugbank.com/releases/latest#open-data --> DrugBank Vocabulary --> Download

In [None]:
# Load the DrugBank vocabulary CSV from the unprocessed-data folder
DrugBank = pd.read_csv(unprocessed_data_location + 'drugbank vocabulary.csv') 

In [None]:
# Collect the drug identifier column of every RNA-drug edge file (header-less, tab-separated).
# The identifier sits in column 1 of the ASO-mRNA file and in column 0 of the others.
# The last ASO file previously used `.loc[0]`, which selects the first ROW rather than
# column 0, so only that single line of the file contributed identifiers.
ASOdnonO_data = pd.concat([pd.read_csv('../resources/edge_data/RASOd-mRNA.txt',sep='\t',header=None)[1],
    pd.read_csv('../resources/edge_data/RASOd-disease.txt',sep='\t',header=None)[0],
    pd.read_csv('../resources/edge_data/RASOd-protein11007.txt',sep='\t',header=None)[0],
    pd.read_csv('../resources/edge_data/RASOd-protein10002.txt',sep='\t',header=None)[0]])

aptamerdnonO_data = pd.concat([pd.read_csv('../resources/edge_data/Raptamerd-protein.txt',sep='\t',header=None)[0],
    pd.read_csv('../resources/edge_data/Raptamerd-disease.txt',sep='\t',header=None)[0]])

siRNAdnonO_data = pd.concat([pd.read_csv('../resources/edge_data/RsiRNAd-mRNA.txt',sep='\t',header=None)[0],
    pd.read_csv('../resources/edge_data/RsiRNAd-disease.txt',sep='\t',header=None)[0]])

mRNAvnonO_data = pd.read_csv('../resources/edge_data/RmRNAv-disease.txt',sep='\t',header=None)[0]

# One de-duplicated Series of identifiers across all drug classes
RNAdrugs = pd.concat([ASOdnonO_data, aptamerdnonO_data, siRNAdnonO_data, mRNAvnonO_data]).drop_duplicates().reset_index(drop=True)
RNAdrugs.head()

In [None]:
# Keep the DrugBank vocabulary rows whose 'DrugBank ID' matches a collected identifier;
# the helper column 0 is dropped after the join
RNAdrugs = pd.merge(pd.DataFrame(RNAdrugs), DrugBank, left_on=[0], right_on=['DrugBank ID']).drop(columns=[0])

In [None]:
# Persist the de-duplicated RNA drug properties
RNAdrugs.drop_duplicates().to_csv(properties_location + 'RNAdrugs.csv', index=None)

***
# Gene

## PheKnowLator

In [None]:
#url = 'https://zenodo.org/records/10056198/files/genomic_typing_dict.pkl.zip?download=1'
#data_downloader(url, unprocessed_data_location)

# Load the genomic typing dictionary. The context manager closes the file handle, which
# the previous bare open() call left open.
# NOTE(review): pickle.load executes code embedded in the file — only use a trusted copy.
with open(unprocessed_data_location + 'genomic_typing_dict.pkl', 'rb') as typing_file:
    genomic_type_mapper = pickle.load(typing_file)
genomic_type_mapper

In [None]:
#url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt'
#data_downloader(url, unprocessed_data_location)

# load data
# Tab-separated file with a header row; low_memory=False reads it in a single pass
hgnc = pd.read_csv(unprocessed_data_location + 'hgnc_complete_set.txt', header=0, delimiter='\t', low_memory=False)
hgnc

In [None]:
# Keep approved records and the identifier/annotation columns used below
hgnc = hgnc.loc[hgnc['status'] == 'Approved']
hgnc = hgnc[['hgnc_id', 'entrez_id', 'ensembl_gene_id', 'uniprot_ids', 'symbol', 'locus_type', 'alias_symbol', 'name', 'location', 'alias_name']]
hgnc = hgnc.rename(columns={'uniprot_ids': 'uniprot_id', 'location': 'map_location', 'locus_type': 'hgnc_gene_type'})
# Results are assigned back instead of calling an inplace method on a column selection:
# under pandas Copy-on-Write the selection is a temporary object and the frame would not
# be updated. The pattern is now a raw string (same regex, no invalid escape sequence).
hgnc['hgnc_id'] = hgnc['hgnc_id'].replace(r'.*\:', '', regex=True)  # strip 'HGNC' off of the identifiers
hgnc = hgnc.fillna('None')  # replace NaN with 'None'
hgnc['entrez_id'] = hgnc['entrez_id'].apply(lambda x: str(int(x)) if x != 'None' else 'None')  # make col str

# combine certain columns into single column
hgnc['name'] = hgnc['name'] + '|' + hgnc['alias_name']
hgnc['synonyms'] = hgnc['alias_symbol'] + '|' + hgnc['alias_name'] + '|' + hgnc['name']
hgnc['symbol'] = hgnc['symbol'] + '|' + hgnc['alias_symbol']

# explode nested data and reformat values in preparation for combining it with other gene identifiers
explode_df_hgnc = explodes_data(hgnc.copy(), ['ensembl_gene_id', 'uniprot_id', 'symbol', 'name', 'synonyms'], '|')

# reformat hgnc gene type (replacements are applied one key at a time, in dictionary order)
for val, new_val in genomic_type_mapper['hgnc_gene_type'].items():
    explode_df_hgnc['hgnc_gene_type'] = explode_df_hgnc['hgnc_gene_type'].replace(val, new_val)

# reformat master hgnc gene type
explode_df_hgnc['master_gene_type'] = explode_df_hgnc['hgnc_gene_type']
master_dict = genomic_type_mapper['hgnc_master_gene_type']
for val, new_val in master_dict.items():
    explode_df_hgnc['master_gene_type'] = explode_df_hgnc['master_gene_type'].replace(val, new_val)

# post-process reformatted data
explode_df_hgnc = explode_df_hgnc.drop(['alias_symbol', 'alias_name'], axis=1)  # remove the alias columns
explode_df_hgnc = explode_df_hgnc.drop_duplicates(subset=None, keep='first')

# preview data
explode_df_hgnc.head(n=3)

In [None]:
#url = 'ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz'
#data_downloader(url, unprocessed_data_location)

# Read only column 8 of the tab-separated GTF file, skipping its first five lines
ensembl_geneset = pd.read_csv(unprocessed_data_location + 'Homo_sapiens.GRCh38.102.gtf',
                                  header = None, delimiter='\t', skiprows=5, usecols=[8], low_memory=False)
ensembl_geneset

In [None]:
# Turn every attribute string that names both a gene and a transcript into one identifier tuple.
ensembl_data = list(ensembl_geneset[8])
ensembl_df_data = []
for attributes in tqdm(ensembl_data):
    if 'gene_id' in attributes and 'transcript_id' in attributes:
        # each ';'-separated field is split on ' "' into a key and a quoted value;
        # the text after the final ';' is discarded
        fields = (field.split(' "') for field in attributes.split(';')[0:-1])
        row_dict = {parts[0].lstrip(): parts[1].strip('"') for parts in fields}
        ensembl_df_data.append((row_dict['gene_id'], row_dict['transcript_id'], row_dict['gene_name'],
                                row_dict['gene_biotype'], row_dict['transcript_name'],
                                row_dict['transcript_biotype']))

# convert to data frame
ensembl_geneset = pd.DataFrame(ensembl_df_data,
                               columns=['ensembl_gene_id', 'transcript_stable_id', 'symbol',
                                        'ensembl_gene_type', 'transcript_name', 'ensembl_transcript_type'])

# Every replacement below is assigned back to its column: an inplace method called on a column
# selection does not update the parent frame when pandas Copy-on-Write is active.

# reformat ensembl gene type
for old_type, new_type in genomic_type_mapper['ensembl_gene_type'].items():
    ensembl_geneset['ensembl_gene_type'] = ensembl_geneset['ensembl_gene_type'].replace(old_type, new_type)

# reformat master gene type, starting from the already-reformatted ensembl gene type
ensembl_geneset['master_gene_type'] = ensembl_geneset['ensembl_gene_type']
for old_type, new_type in genomic_type_mapper['ensembl_master_gene_type'].items():
    ensembl_geneset['master_gene_type'] = ensembl_geneset['master_gene_type'].replace(old_type, new_type)

# reformat master transcript type
ensembl_geneset['ensembl_transcript_type'] = ensembl_geneset['ensembl_transcript_type'].replace(
    'vault_RNA', 'vaultRNA', regex=False)
ensembl_geneset['master_transcript_type'] = ensembl_geneset['ensembl_transcript_type']
for old_type, new_type in genomic_type_mapper['ensembl_master_transcript_type'].items():
    ensembl_geneset['master_transcript_type'] = ensembl_geneset['master_transcript_type'].replace(old_type, new_type)

# post-process reformatted data
ensembl_geneset = ensembl_geneset.drop_duplicates(subset=None, keep='first')

# preview data
ensembl_geneset.head(n=3)

In [None]:
# Ensembl release 102 to UniProt cross-reference table. To refresh the local copy, run:
#   url_uniprot = 'ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.uniprot.tsv.gz'
#   data_downloader(url_uniprot, unprocessed_data_location)
ensembl_uniprot = pd.read_csv(
    os.path.join(unprocessed_data_location, 'Homo_sapiens.GRCh38.102.uniprot.tsv'),
    sep='\t',
    header=0,
    low_memory=False,
)
ensembl_uniprot

In [None]:
# Normalise the Ensembl-UniProt table: harmonise column names, recode '-' and NaN to 'None',
# filter the rows, then drop the bookkeeping columns and exact duplicate rows.
ensembl_uniprot = (
    ensembl_uniprot
    .rename(columns={'xref': 'uniprot_id', 'gene_stable_id': 'ensembl_gene_id'})
    .replace('-', 'None')
    .fillna('None')
    # keep rows that report an xref identity
    .loc[lambda frame: frame['xref_identity'].apply(lambda identity: identity != 'None')]
    # remove isoforms (accessions containing '-')
    .loc[lambda frame: frame['uniprot_id'].apply(lambda accession: '-' not in accession)]
    # keep direct cross-references only
    .loc[lambda frame: frame['info_type'].apply(lambda info: info == 'DIRECT')]
    .drop(columns=['db_name', 'info_type', 'source_identity', 'xref_identity', 'linkage_type'])
    .drop_duplicates(subset=None, keep='first')
)

In [None]:
# Ensembl release 102 to Entrez Gene cross-reference table. To refresh the local copy, run:
#   url_entrez = 'ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.entrez.tsv.gz'
#   data_downloader(url_entrez, unprocessed_data_location)
ensembl_entrez = pd.read_csv(
    os.path.join(unprocessed_data_location, 'Homo_sapiens.GRCh38.102.entrez.tsv'),
    sep='\t',
    header=0,
    low_memory=False,
)
ensembl_entrez

In [None]:
# Normalise the Ensembl-Entrez table: harmonise column names, keep dependent EntrezGene
# cross-references, recode '-' and NaN to 'None', then drop bookkeeping columns and duplicate rows.
# NOTE(review): entrez_id keeps whatever dtype read_csv inferred for 'xref'; confirm it matches the
# string entrez_id columns this table is merged with later.
ensembl_entrez = (
    ensembl_entrez
    .rename(columns={'xref': 'entrez_id', 'gene_stable_id': 'ensembl_gene_id'})
    .loc[lambda frame: frame['db_name'].apply(lambda source: source == 'EntrezGene')]
    .loc[lambda frame: frame['info_type'].apply(lambda info: info == 'DEPENDENT')]
    .replace('-', 'None')
    .fillna('None')
    .drop(columns=['db_name', 'info_type', 'source_identity', 'xref_identity', 'linkage_type'])
    .drop_duplicates(subset=None, keep='first')
)

In [None]:
# Outer-join the UniProt and Entrez cross-references on every column they share.
# The shared columns are sorted because set ordering is not stable between Python sessions and an
# outer merge orders its rows by the join keys in the order given; a fixed key order keeps the row
# order (and any later keep='first' de-duplication) reproducible.
merge_cols = sorted(set(ensembl_entrez.columns) & set(ensembl_uniprot.columns))
ensembl_annot = pd.merge(ensembl_uniprot, ensembl_entrez, on=merge_cols, how='outer')
ensembl_annot = ensembl_annot.fillna('None')

ensembl_annot.head(n=3)

In [None]:
# Outer-join the parsed gene set with the UniProt/Entrez annotations on every shared column.
# The shared columns are sorted because set ordering is not stable between Python sessions and an
# outer merge orders its rows by the join keys in the order given.
merge_cols = sorted(set(ensembl_annot.columns) & set(ensembl_geneset.columns))
ensembl = pd.merge(ensembl_geneset, ensembl_annot, on=merge_cols, how='outer')
ensembl = ensembl.fillna('None')
ensembl = ensembl.replace('NA', 'None', regex=False)  # exact-match recode of the literal string 'NA'

ensembl.head(n=3)

In [None]:
# UniProt identifier mapping for organism 9606 (gzip-compressed TSV). To refresh the local copy, run:
#   url = 'https://rest.uniprot.org/uniprotkb/stream?compressed=true&download=true&fields=accession%2Cxref_geneid%2Cid%2Cxref_ensembl%2Cxref_hgnc%2Cgene_primary%2Cgene_synonym%2Corganism_id&format=tsv&query=%28%28organism_id%3A9606%29%29'
#   data_downloader(url, unprocessed_data_location, 'uniprot_identifier_mapping.tab')
uniprot = pd.read_csv(
    os.path.join(unprocessed_data_location, 'uniprot_identifier_mapping.tab'),
    sep='\t',
    header=0,
    compression='gzip',
)
uniprot

In [None]:
# replace NaN with 'None', then align the column names with the other identifier tables
uniprot = uniprot.fillna('None').rename(columns={'Entry': 'uniprot_id',
                                                 'GeneID': 'entrez_id',
                                                 'Ensembl': 'transcript_stable_id',
                                                 'HGNC': 'hgnc_id',
                                                 'Gene Names (synonym)': 'synonyms',
                                                 'Gene Names (primary)': 'symbol'})

# update space-delimited synonyms to a pipe (i.e. '|'); only all-uppercase values are re-delimited
uniprot['synonyms'] = uniprot['synonyms'].apply(lambda x: '|'.join(x.split()) if x.isupper() else x)

# no reviewed/unreviewed filtering is applied to the entries

# explode nested data
explode_df_uniprot = explodes_data(uniprot.copy(), ['transcript_stable_id', 'entrez_id', 'hgnc_id'], ';')
explode_df_uniprot = explodes_data(explode_df_uniprot.copy(), ['symbol', 'synonyms'], '|')

# keep only the text before the first whitespace of each transcript identifier; the result is assigned
# back because an inplace method on a column selection is a no-op under pandas Copy-on-Write
explode_df_uniprot['transcript_stable_id'] = explode_df_uniprot['transcript_stable_id'].replace(
    r'\s.*', '', regex=True)

# remove unneeded columns and duplicates
explode_df_uniprot = explode_df_uniprot.drop(columns=['Entry Name', 'Organism (ID)'])
explode_df_uniprot = explode_df_uniprot.drop_duplicates(subset=None, keep='first')

# preview data
explode_df_uniprot.head(n=3)

In [None]:
# NCBI Gene info table for Homo sapiens. To refresh the local copy, run:
#   url = 'ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz'
#   data_downloader(url, unprocessed_data_location)
ncbi_gene = pd.read_csv(
    os.path.join(unprocessed_data_location, 'Homo_sapiens.gene_info'),
    sep='\t',
    header=0,
    low_memory=False,
)
ncbi_gene

In [None]:
# preprocess data
# .copy() detaches the filtered rows so the edits below never act on a slice of the raw frame
ncbi_gene = ncbi_gene.loc[ncbi_gene['#tax_id'] == 9606].copy()  # remove non-human rows
ncbi_gene = ncbi_gene.replace('-', 'None')
ncbi_gene = ncbi_gene.rename(columns={'GeneID': 'entrez_id', 'Symbol': 'symbol', 'Synonyms': 'synonyms'})
ncbi_gene['synonyms'] = (ncbi_gene['synonyms'] + '|' + ncbi_gene['description'] + '|' +
                         ncbi_gene['Full_name_from_nomenclature_authority'] + '|' + ncbi_gene['Other_designations'])
ncbi_gene['symbol'] = ncbi_gene['Symbol_from_nomenclature_authority'] + '|' + ncbi_gene['symbol']
ncbi_gene['name'] = ncbi_gene['Full_name_from_nomenclature_authority'] + '|' + ncbi_gene['description']

# explode nested data
explode_df_ncbi_gene = explodes_data(ncbi_gene.copy(), ['symbol', 'synonyms', 'name', 'dbXrefs'], '|')

# clean up results: keep Ensembl / HGNC / IMGT cross-references and give HGNC and Ensembl their own columns
explode_df_ncbi_gene['entrez_id'] = explode_df_ncbi_gene['entrez_id'].astype(str)
explode_df_ncbi_gene = explode_df_ncbi_gene.loc[
    explode_df_ncbi_gene['dbXrefs'].apply(lambda x: x.split(':')[0] in ['Ensembl', 'HGNC', 'IMGT/GENE-DB'])].copy()
# rows whose cross-reference belongs to another database get NaN here and 'None' after fillna
explode_df_ncbi_gene['hgnc_id'] = explode_df_ncbi_gene['dbXrefs'].where(
    explode_df_ncbi_gene['dbXrefs'].apply(lambda x: x.startswith('HGNC')))
explode_df_ncbi_gene['ensembl_gene_id'] = explode_df_ncbi_gene['dbXrefs'].where(
    explode_df_ncbi_gene['dbXrefs'].apply(lambda x: x.startswith('Ensembl')))
explode_df_ncbi_gene = explode_df_ncbi_gene.fillna('None')

# Replacements below are assigned back to their column: an inplace method called on a column
# selection does not update the parent frame when pandas Copy-on-Write is active.

# reformat entrez gene type
explode_df_ncbi_gene['entrez_gene_type'] = explode_df_ncbi_gene['type_of_gene']
for old_type, new_type in genomic_type_mapper['entrez_gene_type'].items():
    explode_df_ncbi_gene['entrez_gene_type'] = explode_df_ncbi_gene['entrez_gene_type'].replace(old_type, new_type)

# reformat master gene type, starting from the already-reformatted entrez gene type
explode_df_ncbi_gene['master_gene_type'] = explode_df_ncbi_gene['entrez_gene_type']
for old_type, new_type in genomic_type_mapper['master_gene_type'].items():
    explode_df_ncbi_gene['master_gene_type'] = explode_df_ncbi_gene['master_gene_type'].replace(old_type, new_type)

# post-process reformatted data
explode_df_ncbi_gene = explode_df_ncbi_gene.drop(
    columns=['type_of_gene', 'dbXrefs', 'description', 'Nomenclature_status', 'Modification_date',
             'LocusTag', '#tax_id', 'Full_name_from_nomenclature_authority', 'Feature_type',
             'Symbol_from_nomenclature_authority'])
explode_df_ncbi_gene['hgnc_id'] = explode_df_ncbi_gene['hgnc_id'].replace('HGNC:', '', regex=True)
explode_df_ncbi_gene['ensembl_gene_id'] = explode_df_ncbi_gene['ensembl_gene_id'].replace('Ensembl:', '', regex=True)
explode_df_ncbi_gene = explode_df_ncbi_gene.drop_duplicates(subset=None, keep='first')

# preview data
explode_df_ncbi_gene.head(n=3)

In [None]:
# Protein Ontology mapping file (no header row; the three columns are named here). To refresh, run:
#   url = 'https://proconsortium.org/download/current/promapping.txt'
#   data_downloader(url, unprocessed_data_location)
pro_map = pd.read_csv(
    os.path.join(unprocessed_data_location, 'promapping.txt'),
    sep='\t',
    header=None,
    names=['pro_id', 'entry', 'pro_mapping'],
)
pro_map

In [None]:
# keep 'UniProtKB' rows that are neither variants ('_VAR') nor comma-separated lists
pro_map = pro_map.loc[pro_map['entry'].apply(lambda x: x.startswith('Uni') and '_VAR' not in x and ', ' not in x)]
# keep exact mappings; .copy() so the column edits below act on an independent frame
pro_map = pro_map.loc[pro_map['pro_mapping'].apply(lambda x: x.startswith('exact'))].copy()
# Column edits are assigned back rather than done with inplace=True on a column selection, which
# does not update the parent frame when pandas Copy-on-Write is active.
pro_map['pro_id'] = pro_map['pro_id'].replace('PR:', 'PR_', regex=True)  # replace PR: with PR_
pro_map['entry'] = pro_map['entry'].replace(r'(^\w*:)', '', regex=True)  # remove id prefixes
pro_map = pro_map.loc[pro_map['pro_id'].apply(lambda x: '-' not in x)]  # remove isoforms
pro_map = pro_map.rename(columns={'entry': 'uniprot_id'})  # rename columns before merging
pro_map = pro_map.drop(columns=['pro_mapping'])  # remove unneeded columns
pro_map = pro_map.drop_duplicates(subset=None, keep='first')

pro_map.head(n=3)

In [None]:
# Outer-join the Ensembl and HGNC tables on every column they share.
# The shared columns are sorted because set ordering is not stable between Python sessions and an
# outer merge orders its rows by the join keys in the order given.
merge_cols = sorted(set(explode_df_hgnc.columns) & set(ensembl.columns))
ensembl_hgnc_merged_data = pd.merge(ensembl, explode_df_hgnc, on=merge_cols, how='outer')

# clean up merged data
ensembl_hgnc_merged_data = ensembl_hgnc_merged_data.fillna('None')
ensembl_hgnc_merged_data = ensembl_hgnc_merged_data.drop_duplicates(subset=None, keep='first')

ensembl_hgnc_merged_data.head(n=3)

In [None]:
# Outer-join the Ensembl+HGNC table with the UniProt table on every column they share.
# The shared columns are sorted because set ordering is not stable between Python sessions and an
# outer merge orders its rows by the join keys in the order given.
merge_cols = sorted(set(ensembl_hgnc_merged_data.columns) & set(explode_df_uniprot.columns))
ensembl_hgnc_uniprot_merged_data = pd.merge(ensembl_hgnc_merged_data, explode_df_uniprot, on=merge_cols, how='outer')

# clean up merged data
ensembl_hgnc_uniprot_merged_data = ensembl_hgnc_uniprot_merged_data.fillna('None')
ensembl_hgnc_uniprot_merged_data = ensembl_hgnc_uniprot_merged_data.drop_duplicates(subset=None, keep='first')

# preview data
ensembl_hgnc_uniprot_merged_data.head(n=3)

In [None]:
# Outer-join the Ensembl+HGNC+UniProt table with the NCBI Gene table on every column they share.
# The shared columns are sorted because set ordering is not stable between Python sessions and an
# outer merge orders its rows by the join keys in the order given.
merge_cols = sorted(set(ensembl_hgnc_uniprot_merged_data.columns) & set(explode_df_ncbi_gene.columns))
ensembl_hgnc_uniprot_ncbi_merged_data = pd.merge(ensembl_hgnc_uniprot_merged_data, explode_df_ncbi_gene,
                                                 on=merge_cols, how='outer')

# clean up merged data
ensembl_hgnc_uniprot_ncbi_merged_data = ensembl_hgnc_uniprot_ncbi_merged_data.fillna('None')
ensembl_hgnc_uniprot_ncbi_merged_data = ensembl_hgnc_uniprot_ncbi_merged_data.drop_duplicates(subset=None, keep='first')

# preview data
ensembl_hgnc_uniprot_ncbi_merged_data.head(n=3)

In [None]:
# Attach Protein Ontology identifiers through the UniProt accession (outer join), then recode
# missing values to 'None' and drop exact duplicate rows.
merged_data = (
    ensembl_hgnc_uniprot_ncbi_merged_data
    .merge(pro_map, how='outer', on='uniprot_id')
    .fillna('None')
    .drop_duplicates(subset=None, keep='first')
)

# preview data
merged_data.head(n=3)

In [None]:
# Rewrite symbols shaped like '<1-2 chars>-<3 chars>' (e.g. '1-Mar') as '<3 chars upper-cased><1-2 chars>'
# (e.g. 'MAR1'); every other symbol is kept unchanged.
clean_dates = []
for symbol in tqdm(list(merged_data['symbol'])):
    parts = symbol.split('-')
    if '-' in symbol and len(parts[0]) < 3 and len(parts[1]) == 3:
        clean_dates.append(parts[1].upper() + parts[0])
    else:
        clean_dates.append(symbol)

# add cleaned date var back to data set
merged_data['symbol'] = clean_dates
merged_data = merged_data.fillna('None')

# make sure that all gene and transcript type columns have 'None' recoded to unknown or not protein-coding.
# Each result is assigned back to its column: an inplace method called on a column selection does not
# update the parent frame when pandas Copy-on-Write is active.
type_defaults = {
    'hgnc_gene_type': 'unknown',
    'ensembl_gene_type': 'unknown',
    'entrez_gene_type': 'unknown',
    'master_gene_type': 'unknown',
    'master_transcript_type': 'not protein-coding',
    'ensembl_transcript_type': 'unknown',
}
for type_col, default in type_defaults.items():
    merged_data[type_col] = merged_data[type_col].replace('None', default, regex=False)

# remove duplicates
merged_data_clean = merged_data.drop_duplicates(subset=None, keep='first')

# write data
merged_data_clean.to_csv(processed_data_location + 'Merged_Human_Ensembl_Entrez_HGNC_Uniprot_Identifiers.txt', header=True, sep='\t', index=False)

# preview data
merged_data_clean.head(n=3)

In [None]:
# Drop every 'None' token from the pipe-delimited cells, then turn cells that are exactly
# 'None', '' or 'unknown' into NaN.
for column_name in merged_data_clean.columns:
    merged_data_clean[column_name] = merged_data_clean[column_name].apply(
        lambda cell: '|'.join(filter(lambda token: token != 'None', cell.split('|'))))
merged_data_clean = merged_data_clean.replace(to_replace=['None', '', 'unknown'], value=numpy.nan)

# identifier columns: everything ending in '_id', plus the gene symbol
identifiers = [name for name in merged_data_clean.columns if name.endswith('_id')]
identifiers.append('symbol')

In [None]:
# convert data to dictionary: '<identifier column>_<identifier value>' -> list of '<column>_<value>'
# strings taken from every row that carries that identifier
master_dict = {}
for idx in tqdm(identifiers):
    # groupby leaves rows with a missing key out of every group, so NaN identifiers never appear here;
    # the explicit isna guard replaces the old `x != numpy.nan` test, which is always True
    for grp, df in merged_data_clean.groupby(idx):
        if pd.isna(grp):
            continue
        df = df.dropna(axis=1, how='all')  # ignore columns that are empty for this identifier
        other_cols = [col for col in df.columns if col != idx]
        if len(other_cols) > 0:
            key = idx + '_' + grp
            master_dict.setdefault(key, []).extend(
                col + '_' + x for col in other_cols for x in set(df[col]) if isinstance(x, str))

In [None]:
# Add a consolidated '<gene|transcript>_type_update_<coding status>' entry to every gene/transcript key.
reformatted_mapped_identifiers = dict()
gene_prefix, trans_prefix = 'master_gene_type_', 'master_transcript_type_'
for key, values in tqdm(master_dict.items()):
    identifier_info = set(values)
    if key.split('_')[0] in ['protein', 'uniprot', 'pro']:
        pass  # protein-level identifiers get no type update
    elif 'transcript' in key:
        trans_match = [x.replace(trans_prefix, '') for x in values if trans_prefix in x]
        if len(trans_match) > 0:
            is_coding = 'protein-coding' in trans_match or 'protein_coding' in trans_match
            identifier_info.add('transcript_type_update_' + ('protein-coding' if is_coding else 'not protein-coding'))
    else:
        gene_match = [x.replace(gene_prefix, '') for x in values if x.startswith(gene_prefix) and 'type' in x]
        if len(gene_match) > 0:
            is_coding = 'protein-coding' in gene_match or 'protein_coding' in gene_match
            identifier_info.add('gene_type_update_' + ('protein-coding' if is_coding else 'not protein-coding'))
    reformatted_mapped_identifiers[key] = identifier_info

# save a copy of the dictionary
# output > 4GB requires special approach: https://stackoverflow.com/questions/42653386/does-pickle-randomly-fail-with-oserror-on-large-files
filepath = processed_data_location + 'Merged_gene_rna_protein_identifiers.pkl'

# defensive way to write pickle.write, allowing for very large files on all platforms:
# the payload is written in chunks of at most max_bytes
max_bytes, bytes_out = 2**31 - 1, pickle.dumps(reformatted_mapped_identifiers)
n_bytes = len(bytes_out)  # payload length; sys.getsizeof would also count the bytes object's own overhead

with open(filepath, 'wb') as f_out:
    for idx in range(0, n_bytes, max_bytes):
        f_out.write(bytes_out[idx:idx+max_bytes])

In [None]:
# load data
filepath = processed_data_location + 'Merged_gene_rna_protein_identifiers.pkl'

# defensive way to write pickle.load, allowing for very large files on all platforms:
# the file is read in chunks of at most max_bytes
max_bytes = 2**31 - 1
input_size = os.path.getsize(filepath)
bytes_in = bytearray(0)

with open(filepath, 'rb') as f_in:
    for _ in range(0, input_size, max_bytes):
        bytes_in += f_in.read(max_bytes)

# load pickled data (only unpickle files written by this notebook; pickle can execute arbitrary code)
reformatted_mapped_identifiers = pickle.loads(bytes_in)

# show the number of identifiers rather than rendering the whole (potentially multi-GB) dictionary
len(reformatted_mapped_identifiers)

In [None]:
# Gene-level property table: rows that have an Entrez id, without the transcript/protein/source-type
# columns, reduced to the first row seen for each Entrez id.
df = (
    merged_data_clean
    .loc[merged_data_clean['entrez_id'].notna()]
    .drop(columns=['transcript_stable_id', 'transcript_name', 'ensembl_gene_type',
                   'ensembl_transcript_type', 'master_transcript_type', 'protein_stable_id',
                   'hgnc_gene_type', 'entrez_gene_type'])
    .drop_duplicates()
    .drop_duplicates(subset=['entrez_id'], keep='first')
)
df

In [None]:
# write the gene property table without the index column
(
    df
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'gene.csv'), index=None)
)

***
# OBO terms
***

***
# RO

In [None]:
# Parse the RO OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
ro_graph = Graph()
ro_graph.parse(os.path.join(ontology_data_location, 'ro_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(ro_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(ro_graph)]
master_synonyms = [x for x in ro_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _ro_values(subject, predicate):
    """Return the objects of (subject, predicate) in ro_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in ro_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


for term in tqdm(cls):
    cls_label = _ro_values(term, RDFS.label)           # labels
    cls_syn = synonyms_by_subject.get(term, [])        # synonyms
    cls_desc = _ro_values(term, obo.IAO_0000115)       # description

    # single-valued fields take the first value returned by the graph; missing values become 'None'
    relation_metadata_dict[str(term)] = {
        'Label': cls_label[0] if cls_label else 'None',
        'Description': cls_desc[0] if cls_desc else 'None',
        'Synonym(s)': '|'.join(cls_syn) if cls_syn else 'None'
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'RO.csv'))
)

***
# HPO

In [None]:
# Parse the HPO OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
hpo_graph = Graph()
hpo_graph.parse(os.path.join(ontology_data_location, 'hp_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(hpo_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(hpo_graph)]
master_synonyms = [x for x in hpo_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _hpo_values(subject, predicate):
    """Return the objects of (subject, predicate) in hpo_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in hpo_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


for term in tqdm(cls):
    # labels and descriptions are read from hpo_graph (previously they were looked up in ro_graph,
    # so only terms that also exist in RO could ever receive a label or description)
    cls_label = _hpo_values(term, RDFS.label)
    cls_syn = synonyms_by_subject.get(term, [])        # synonyms
    cls_desc = _hpo_values(term, obo.IAO_0000115)      # description
    cls_ed = _hpo_values(term, dbxref_uri)             # DbXref

    # single-valued fields take the first value returned by the graph; missing values become 'None'
    relation_metadata_dict[str(term)] = {
        'Label': cls_label[0] if cls_label else 'None',
        'Description': cls_desc[0] if cls_desc else 'None',
        'Synonym(s)': '|'.join(cls_syn) if cls_syn else 'None',
        'DbXref': '|'.join(cls_ed) if cls_ed else 'None'
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'HPO.csv'))
)

***
# GO

In [None]:
# Parse the GO OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
go_graph = Graph()
go_graph.parse(os.path.join(ontology_data_location, 'go_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(go_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
hasOBONamespace = URIRef("http://www.geneontology.org/formats/oboInOwl#hasOBONamespace")

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(go_graph)]
master_synonyms = [x for x in go_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _go_values(subject, predicate):
    """Return the objects of (subject, predicate) in go_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in go_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


for term in tqdm(cls):
    cls_label = _go_values(term, RDFS.label)           # labels
    cls_syn = synonyms_by_subject.get(term, [])        # synonyms
    # description is read from go_graph (previously it was looked up in ro_graph)
    cls_desc = _go_values(term, obo.IAO_0000115)
    cls_ed = _go_values(term, hasOBONamespace)         # vocabulary(MF/BP/CC)

    # single-valued fields take the first value returned by the graph; missing values become 'None'
    relation_metadata_dict[str(term)] = {
        'Label': cls_label[0] if cls_label else 'None',
        'Description': cls_desc[0] if cls_desc else 'None',
        # pipe-join the individual synonyms (previously the repr of the whole list was stored);
        # the column keeps its existing name 'Synonym'
        'Synonym': '|'.join(cls_syn) if cls_syn else 'None',
        'Vocabulary(MF/BP/CC)': cls_ed[0] if cls_ed else 'None'
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'GO.csv'))
)

***
# Mondo

In [None]:
# Parse the Mondo OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
mondo_graph = Graph()
mondo_graph.parse(os.path.join(ontology_data_location, 'mondo_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(mondo_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
# defined here so this cell does not depend on another ontology cell having been run first
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(mondo_graph)]
master_synonyms = [x for x in mondo_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _mondo_values(subject, predicate):
    """Return the objects of (subject, predicate) in mondo_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in mondo_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


for term in tqdm(cls):
    cls_label = _mondo_values(term, RDFS.label)        # labels
    cls_syn = synonyms_by_subject.get(term, [])        # synonyms
    cls_desc = _mondo_values(term, obo.IAO_0000115)    # description
    cls_ed = _mondo_values(term, dbxref_uri)           # DbXref

    # single-valued fields take the first value returned by the graph; missing values become 'None'
    relation_metadata_dict[str(term)] = {
        'Label': cls_label[0] if cls_label else 'None',
        'Description': cls_desc[0] if cls_desc else 'None',
        'Synonym(s)': '|'.join(cls_syn) if cls_syn else 'None',
        'DbXref': '|'.join(cls_ed) if cls_ed else 'None'
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'Mondo.csv'))
)

***
# VO

In [None]:
# Parse the VO OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
vo_graph = Graph()
vo_graph.parse(os.path.join(ontology_data_location, 'vo_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(vo_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(vo_graph)]
master_synonyms = [x for x in vo_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _vo_values(subject, predicate):
    """Return the objects of (subject, predicate) in vo_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in vo_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


def _vo_first(subject, predicate):
    """Return the first value yielded by _vo_values(subject, predicate), or 'None' if there is none."""
    values = _vo_values(subject, predicate)
    return values[0] if values else 'None'


for term in tqdm(cls):
    cls_syn = synonyms_by_subject.get(term, [])        # synonyms
    # editor notes: all notes are pipe-joined (previously only the first note was kept, although the
    # value was built with '|'.join)
    cls_ed = _vo_values(term, obo.IAO_0000116)

    relation_metadata_dict[str(term)] = {
        'Label': _vo_first(term, RDFS.label),
        'AlternativeLabel': _vo_first(term, obo.IAO_0000118),    # alternative label
        'seeAlso': _vo_first(term, RDFS.seeAlso),
        'TradeName': _vo_first(term, obo.VO_0003099),            # trade name
        'Description': _vo_first(term, obo.IAO_0000115),         # description
        'DefinitionSource': _vo_first(term, obo.IAO_0000119),    # definition source
        'Synonym(s)': '|'.join(cls_syn) if cls_syn else 'None',
        'EditorNotes': '|'.join(cls_ed) if cls_ed else 'None',
        'VaccineProperName': _vo_first(term, obo.VO_0003158),    # vaccine proper name
        'FDAindications': _vo_first(term, obo.VO_0003160),       # FDA indications
        'ExampleOfUsage': _vo_first(term, obo.IAO_0000112),      # example of usage
        'vaccineSTN': _vo_first(term, obo.VO_0003162)            # vaccine STN
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'VO.csv'))
)

***
# ChEBI

In [None]:
# Parse the ChEBI OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
chebi_graph = Graph()
chebi_graph.parse(os.path.join(ontology_data_location, 'chebi_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(chebi_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
hasOBONamespace = URIRef("http://www.geneontology.org/formats/oboInOwl#hasOBONamespace")
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")
iupacName = URIRef("http://purl.obolibrary.org/obo/chebi#IUPAC_NAME")
charge = URIRef("http://purl.obolibrary.org/obo/chebi/charge")
mass = URIRef("http://purl.obolibrary.org/obo/chebi/mass")
smiles = URIRef("http://purl.obolibrary.org/obo/chebi/smiles")
formula = URIRef("http://purl.obolibrary.org/obo/chebi/formula")
monoisotopicmass = URIRef("http://purl.obolibrary.org/obo/chebi/monoisotopicmass")
inchi = URIRef("http://purl.obolibrary.org/obo/chebi/inchi")
inchikey = URIRef("http://purl.obolibrary.org/obo/chebi/inchikey")

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(chebi_graph)]
master_synonyms = [x for x in chebi_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _chebi_values(subject, predicate):
    """Return the objects of (subject, predicate) in chebi_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in chebi_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


def _chebi_first(subject, predicate):
    """Return the first value yielded by _chebi_values(subject, predicate), or 'None' if there is none."""
    values = _chebi_values(subject, predicate)
    return values[0] if values else 'None'


for term in tqdm(cls):
    cls_syn = synonyms_by_subject.get(term, [])        # synonyms
    cls_ed = _chebi_values(term, dbxref_uri)           # DbXref

    relation_metadata_dict[str(term)] = {
        'Label': _chebi_first(term, RDFS.label),
        'Description': _chebi_first(term, obo.IAO_0000115),
        'Synonym(s)': '|'.join(cls_syn) if cls_syn else 'None',
        'DbXref': '|'.join(cls_ed) if cls_ed else 'None',
        'Namespace': _chebi_first(term, hasOBONamespace),
        'IUPACname': _chebi_first(term, iupacName),
        'Charge': _chebi_first(term, charge),
        'Mass': _chebi_first(term, mass),
        'Smiles': _chebi_first(term, smiles),
        'Monoisotopicmass': _chebi_first(term, monoisotopicmass),
        'Inchi': _chebi_first(term, inchi),
        'Inchikey': _chebi_first(term, inchikey),
        # the formula was extracted before but never stored; it is appended as the last column so the
        # positions of the existing columns do not change
        'Formula': _chebi_first(term, formula)
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'ChEBI.csv'))
)

***
# Uberon

In [None]:
# Parse the Uberon 'ext' OWL file (with imports) into an rdflib graph; Graph comes from the notebook's import cell.
uberon_graph = Graph()
uberon_graph.parse(os.path.join(ontology_data_location, 'ext_with_imports.owl'))

# every triple in the parsed graph is counted as one edge
print('There are {} edges in the ontology.'.format(len(uberon_graph)))

In [None]:
# get metadata
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
# defined here so this cell does not depend on another ontology cell having been run first
dbxref_uri = URIRef("http://www.geneontology.org/formats/oboInOwl#hasDbXref")

# get ontology information (only classes are collected; object properties are not)
cls = [x for x in gets_ontology_classes(uberon_graph)]
master_synonyms = [x for x in uberon_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# index synonym values by subject once, so each class is a dictionary lookup instead of a scan over
# every synonym triple
synonyms_by_subject = {}
for subj, _, obj in master_synonyms:
    synonyms_by_subject.setdefault(subj, []).append(str(obj))


def _uberon_values(subject, predicate):
    """Return the objects of (subject, predicate) in uberon_graph as strings.

    Only values whose n3() form contains no '@' or contains '@en' are kept.
    """
    kept = []
    for obj in uberon_graph.objects(subject, predicate):
        serialized = n3(obj)
        if '@' not in serialized or '@en' in serialized:
            kept.append(str(obj))
    return kept


for term in tqdm(cls):
    cls_label = _uberon_values(term, RDFS.label)            # labels
    cls_syn = synonyms_by_subject.get(term, [])             # synonyms
    cls_desc = _uberon_values(term, obo.IAO_0000115)        # description
    cls_ed = _uberon_values(term, dbxref_uri)               # DbXref
    cls_extd = _uberon_values(term, obo.UBPROP_0000001)     # external definition

    # single-valued fields take the first value returned by the graph; missing values become 'None'
    relation_metadata_dict[str(term)] = {
        'Label': cls_label[0] if cls_label else 'None',
        'Description': cls_desc[0] if cls_desc else 'None',
        'Synonym(s)': '|'.join(cls_syn) if cls_syn else 'None',
        'DbXref': '|'.join(cls_ed) if cls_ed else 'None',
        'ExternalDefinition': cls_extd[0] if cls_extd else 'None'
    }

pd.DataFrame(relation_metadata_dict).T

In [None]:
# one row per term (terms as the index); rows with identical metadata are collapsed before writing
(
    pd.DataFrame(relation_metadata_dict)
    .T
    .drop_duplicates()
    .to_csv(os.path.join(properties_location, 'Uberon.csv'))
)

***
# CLO

In [None]:
# Parse the ontology file for this section into a fresh rdflib graph.
# `Graph` comes from the notebook's top import cell, so it is not re-imported here.
clo_graph = Graph()
clo_graph.parse(f'{ontology_data_location}clo_with_imports.owl')

# len() of an rdflib Graph is its number of triples, reported here as edges
print(f'There are {len(clo_graph)} edges in the ontology.')

In [None]:
# get metadata
# Builds `relation_metadata_dict`: class IRI (str) -> dict of metadata strings, with the
# string 'None' standing in for missing values. Relies on names defined outside this cell:
# `gets_ontology_classes`, `n3`, `dbxref_uri`, `clo_graph`.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
comment = URIRef("http://www.w3.org/2000/01/rdf-schema#comment")
seeAlso = URIRef("http://www.w3.org/2000/01/rdf-schema#seeAlso")
depictedBy = URIRef("http://xmlns.com/foaf/0.1/depicted_by")

# get ontology information
cls = list(gets_ontology_classes(clo_graph))
# every triple whose predicate mentions "synonym" and whose subject is a URIRef
master_synonyms = [x for x in clo_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# Index synonym values by subject once, so the loop below performs one dictionary lookup
# per class instead of rescanning `master_synonyms` for every class. Relative order of
# the synonyms within each subject is preserved.
synonyms_by_subject = {}
for syn_triple in master_synonyms:
    synonyms_by_subject.setdefault(syn_triple[0], []).append(str(syn_triple[2]))

for class_uri in tqdm(cls):
    # Each filter keeps values whose n3() form has no '@' or contains '@en'
    # (presumably untagged or English-tagged literals -- confirm against pkt_kg's n3).
    # labels
    cls_label = [o for o in clo_graph.objects(class_uri, RDFS.label) if '@' not in n3(o) or '@en' in n3(o)]
    labels = str(cls_label[0]) if cls_label else 'None'
    # synonyms
    cls_syn = synonyms_by_subject.get(class_uri, [])
    synonym = '|'.join(cls_syn) if cls_syn else 'None'
    # description
    cls_desc = [o for o in clo_graph.objects(class_uri, obo.IAO_0000115) if '@' not in n3(o) or '@en' in n3(o)]
    desc = str(cls_desc[0]) if cls_desc else 'None'
    # DbXref
    cls_ed = [o for o in clo_graph.objects(class_uri, dbxref_uri) if '@' not in n3(o) or '@en' in n3(o)]
    desc_ed = '|'.join(str(c) for c in cls_ed) if cls_ed else 'None'
    # comment
    cls_com = [o for o in clo_graph.objects(class_uri, comment) if '@' not in n3(o) or '@en' in n3(o)]
    desc_com = '|'.join(str(c) for c in cls_com) if cls_com else 'None'
    # seeAlso
    cls_sa = [o for o in clo_graph.objects(class_uri, seeAlso) if '@' not in n3(o) or '@en' in n3(o)]
    desc_sa = '|'.join(str(c) for c in cls_sa) if cls_sa else 'None'
    # depicted by
    cls_db = [o for o in clo_graph.objects(class_uri, depictedBy) if '@' not in n3(o) or '@en' in n3(o)]
    desc_db = '|'.join(str(c) for c in cls_db) if cls_db else 'None'
    # example of usage
    cls_eou = [o for o in clo_graph.objects(class_uri, obo.IAO_0000112) if '@' not in n3(o) or '@en' in n3(o)]
    desc_eou = str(cls_eou[0]) if cls_eou else 'None'
    # definition source
    cls_ds = [o for o in clo_graph.objects(class_uri, obo.IAO_0000119) if '@' not in n3(o) or '@en' in n3(o)]
    desc_ds = str(cls_ds[0]) if cls_ds else 'None'
    # alternative term
    cls_at = [o for o in clo_graph.objects(class_uri, obo.IAO_0000118) if '@' not in n3(o) or '@en' in n3(o)]
    desc_at = '|'.join(str(c) for c in cls_at) if cls_at else 'None'
    # IEDB alternative term
    cls_iedb = [o for o in clo_graph.objects(class_uri, obo.OBI_9991118) if '@' not in n3(o) or '@en' in n3(o)]
    desc_iedb = str(cls_iedb[0]) if cls_iedb else 'None'

    relation_metadata_dict[str(class_uri)] = {
        'Label': labels, 'Description': desc, 'Synonym(s)': synonym, 'DbXref': desc_ed,
        'Comment': desc_com, 'SeeAlso': desc_sa, 'DepictedBy': desc_db, 'ExampleOfUsage': desc_eou,
        'DefinitionSource': desc_ds, 'AlternativeTerm': desc_at, 'IEDBalternativeTerm': desc_iedb
    }

# one row per class, one column per metadata field
pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the CLO metadata table (one row per class IRI) to the properties directory.
# drop_duplicates() is intentionally not applied: it compares column values only and
# ignores the index (the class IRI), so distinct classes that share identical metadata
# (e.g. several classes where every field is 'None') would be silently dropped.
pd.DataFrame(relation_metadata_dict).T.to_csv(properties_location + 'CLO.csv')

***
# PRO

In [None]:
# Parse the ontology file for this section into a fresh rdflib graph.
# `Graph` comes from the notebook's top import cell, so it is not re-imported here.
pro_graph = Graph()
pro_graph.parse(f'{ontology_data_location}pr_with_imports.owl')

# len() of an rdflib Graph is its number of triples, reported here as edges
print(f'There are {len(pro_graph)} edges in the ontology.')

In [None]:
# get metadata
# Builds `relation_metadata_dict`: class IRI (str) -> dict of metadata strings, with the
# string 'None' standing in for missing values. Relies on names defined outside this cell:
# `gets_ontology_classes`, `n3`, `dbxref_uri`, `pro_graph`.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')
comment = URIRef("http://www.w3.org/2000/01/rdf-schema#comment")
# `seeAlso` and `depictedBy` are not referenced in this cell; kept so the notebook's
# global namespace is unchanged.
seeAlso = URIRef("http://www.w3.org/2000/01/rdf-schema#seeAlso")
depictedBy = URIRef("http://xmlns.com/foaf/0.1/depicted_by")
# NOTE(review): confirm that these two IRIs occur as predicates directly on classes in
# the PRO file; if they are only used as synonym-type values, the two columns derived
# from them below will always be 'None'.
orth = URIRef("http://purl.obolibrary.org/obo/pr#PRO-short-label")
syn2 = URIRef("http://purl.obolibrary.org/obo/pr#Gene-based")

# get ontology information
cls = list(gets_ontology_classes(pro_graph))
# every triple whose predicate mentions "synonym" and whose subject is a URIRef
master_synonyms = [x for x in pro_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# Index synonym values by subject once, so the loop below performs one dictionary lookup
# per class instead of rescanning `master_synonyms` for every class. Relative order of
# the synonyms within each subject is preserved.
synonyms_by_subject = {}
for syn_triple in master_synonyms:
    synonyms_by_subject.setdefault(syn_triple[0], []).append(str(syn_triple[2]))

for class_uri in tqdm(cls):
    # Each filter keeps values whose n3() form has no '@' or contains '@en'
    # (presumably untagged or English-tagged literals -- confirm against pkt_kg's n3).
    # labels
    cls_label = [o for o in pro_graph.objects(class_uri, RDFS.label) if '@' not in n3(o) or '@en' in n3(o)]
    labels = str(cls_label[0]) if cls_label else 'None'
    # synonyms
    cls_syn = synonyms_by_subject.get(class_uri, [])
    synonym = '|'.join(cls_syn) if cls_syn else 'None'
    # description
    cls_desc = [o for o in pro_graph.objects(class_uri, obo.IAO_0000115) if '@' not in n3(o) or '@en' in n3(o)]
    desc = str(cls_desc[0]) if cls_desc else 'None'
    # DbXref
    cls_ed = [o for o in pro_graph.objects(class_uri, dbxref_uri) if '@' not in n3(o) or '@en' in n3(o)]
    desc_ed = '|'.join(str(c) for c in cls_ed) if cls_ed else 'None'
    # comment
    cls_com = [o for o in pro_graph.objects(class_uri, comment) if '@' not in n3(o) or '@en' in n3(o)]
    desc_com = '|'.join(str(c) for c in cls_com) if cls_com else 'None'
    # unique short label for PRO terms for display purposes; based on orthology
    cls_orth = [o for o in pro_graph.objects(class_uri, orth) if '@' not in n3(o) or '@en' in n3(o)]
    desc_orth = str(cls_orth[0]) if cls_orth else 'None'
    # synonyms based on current or previous gene name, ORF name, or ordered locus name
    cls_syn2 = [o for o in pro_graph.objects(class_uri, syn2) if '@' not in n3(o) or '@en' in n3(o)]
    desc_syn2 = '|'.join(str(c) for c in cls_syn2) if cls_syn2 else 'None'

    relation_metadata_dict[str(class_uri)] = {
        'Label': labels, 'Description': desc, 'Synonym(s)': synonym, 'DbXref': desc_ed,
        'Comment': desc_com, 'UniqueShortLabel(orthology-based)': desc_orth,
        'Synonym(s)(unusedName/ORF/locus-based)': desc_syn2,
    }

# one row per class, one column per metadata field
pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the PRO metadata table (one row per class IRI) to the properties directory.
# drop_duplicates() is intentionally not applied: it compares column values only and
# ignores the index (the class IRI), so distinct classes that share identical metadata
# (e.g. several classes where every field is 'None') would be silently dropped.
pd.DataFrame(relation_metadata_dict).T.to_csv(properties_location + 'PRO.csv')

***
# SO

In [None]:
# Parse the ontology file for this section into a fresh rdflib graph.
so_graph = Graph()
so_graph.parse(f'{ontology_data_location}so_with_imports.owl')

# len() of an rdflib Graph is its number of triples, reported here as edges
print(f'There are {len(so_graph)} edges in the ontology.')

In [None]:
# get metadata
# Builds `relation_metadata_dict`: class IRI (str) -> dict of metadata strings, with the
# string 'None' standing in for missing values. Relies on names defined outside this cell:
# `gets_ontology_classes`, `n3`, `dbxref_uri`, `so_graph`.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')

# get ontology information
cls = list(gets_ontology_classes(so_graph))
# every triple whose predicate mentions "synonym" and whose subject is a URIRef
master_synonyms = [x for x in so_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# Index synonym values by subject once, so the loop below performs one dictionary lookup
# per class instead of rescanning `master_synonyms` for every class. Relative order of
# the synonyms within each subject is preserved.
synonyms_by_subject = {}
for syn_triple in master_synonyms:
    synonyms_by_subject.setdefault(syn_triple[0], []).append(str(syn_triple[2]))

for class_uri in tqdm(cls):
    # Each filter keeps values whose n3() form has no '@' or contains '@en'
    # (presumably untagged or English-tagged literals -- confirm against pkt_kg's n3).
    # labels
    cls_label = [o for o in so_graph.objects(class_uri, RDFS.label) if '@' not in n3(o) or '@en' in n3(o)]
    labels = str(cls_label[0]) if cls_label else 'None'
    # synonyms
    cls_syn = synonyms_by_subject.get(class_uri, [])
    synonym = '|'.join(cls_syn) if cls_syn else 'None'
    # description
    cls_desc = [o for o in so_graph.objects(class_uri, obo.IAO_0000115) if '@' not in n3(o) or '@en' in n3(o)]
    desc = str(cls_desc[0]) if cls_desc else 'None'
    # DbXref
    cls_ed = [o for o in so_graph.objects(class_uri, dbxref_uri) if '@' not in n3(o) or '@en' in n3(o)]
    desc_ed = '|'.join(str(c) for c in cls_ed) if cls_ed else 'None'

    relation_metadata_dict[str(class_uri)] = {
        'Label': labels, 'Description': desc, 'Synonym(s)': synonym, 'DbXref': desc_ed
    }

# one row per class, one column per metadata field
pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the SO metadata table (one row per class IRI) to the properties directory.
# drop_duplicates() is intentionally not applied: it compares column values only and
# ignores the index (the class IRI), so distinct classes that share identical metadata
# (e.g. several classes where every field is 'None') would be silently dropped.
pd.DataFrame(relation_metadata_dict).T.to_csv(properties_location + 'SO.csv')

***
# PW

In [None]:
# Parse the ontology file for this section into a fresh rdflib graph.
pw_graph = Graph()
pw_graph.parse(f'{ontology_data_location}pw_with_imports.owl')

# len() of an rdflib Graph is its number of triples, reported here as edges
print(f'There are {len(pw_graph)} edges in the ontology.')

In [None]:
# get metadata
# Builds `relation_metadata_dict`: class IRI (str) -> dict of metadata strings, with the
# string 'None' standing in for missing values. Relies on names defined outside this cell:
# `gets_ontology_classes`, `n3`, `pw_graph`.
relation_metadata_dict, obo = {}, Namespace('http://purl.obolibrary.org/obo/')

# get ontology information
cls = list(gets_ontology_classes(pw_graph))
# every triple whose predicate mentions "synonym" and whose subject is a URIRef
master_synonyms = [x for x in pw_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]

# Index synonym values by subject once, so the loop below performs one dictionary lookup
# per class instead of rescanning `master_synonyms` for every class. Relative order of
# the synonyms within each subject is preserved.
synonyms_by_subject = {}
for syn_triple in master_synonyms:
    synonyms_by_subject.setdefault(syn_triple[0], []).append(str(syn_triple[2]))

for class_uri in tqdm(cls):
    # Each filter keeps values whose n3() form has no '@' or contains '@en'
    # (presumably untagged or English-tagged literals -- confirm against pkt_kg's n3).
    # labels
    cls_label = [o for o in pw_graph.objects(class_uri, RDFS.label) if '@' not in n3(o) or '@en' in n3(o)]
    labels = str(cls_label[0]) if cls_label else 'None'
    # synonyms
    cls_syn = synonyms_by_subject.get(class_uri, [])
    synonym = '|'.join(cls_syn) if cls_syn else 'None'
    # description
    cls_desc = [o for o in pw_graph.objects(class_uri, obo.IAO_0000115) if '@' not in n3(o) or '@en' in n3(o)]
    desc = str(cls_desc[0]) if cls_desc else 'None'

    relation_metadata_dict[str(class_uri)] = {
        'Label': labels, 'Description': desc, 'Synonym(s)': synonym
    }

# one row per class, one column per metadata field
pd.DataFrame(relation_metadata_dict).T

In [None]:
# Write the PW metadata table (one row per class IRI) to the properties directory.
# drop_duplicates() is intentionally not applied: it compares column values only and
# ignores the index (the class IRI), so distinct classes that share identical metadata
# (e.g. several classes where every field is 'None') would be silently dropped.
pd.DataFrame(relation_metadata_dict).T.to_csv(properties_location + 'PW.csv')