In [None]:
%%capture
# The cell magic above must stay on the first line; it captures this cell's
# output so the installer log does not flood the notebook.
import sys
# Install the requirements with the interpreter that runs this kernel, so the
# packages land in the kernel's own environment.
!{sys.executable} -m pip install -r requirements.txt
# Add the parent directory to the import path (presumably where the local
# `pkt_kg` and `builds` packages imported below live - TODO confirm).
sys.path.append('../')

In [None]:
# import needed libraries
import datetime
import glob
import itertools
import networkx
import numpy
import os
import pickle
import re
import requests
import tarfile
import shutil
import pandas as pd
import gffpandas.gffpandas as gffpd
import numpy as np
pd.set_option('display.max_columns', None)
import re

from collections import Counter
from functools import reduce
from rdflib import Graph, Namespace, URIRef, BNode, Literal
from rdflib.namespace import OWL, RDF, RDFS
from reactome2py import content
from tqdm import tqdm
from typing import Dict

from pkt_kg.utils import * 
from builds.ontology_cleaning import *

from typing import Tuple

In [None]:
# root directory for every locally stored resource
resource_data_location = '../resources/'

# processed data sits directly under the resources root
processed_data_location = resource_data_location + 'processed_data/'

# unprocessed data is kept in a sub-directory of the processed data
unprocessed_data_location = processed_data_location + 'unprocessed_data/'

# directory holding the ontology files
ontology_data_location = resource_data_location + 'ontologies/'

# directory holding the edge data
edge_data_location = resource_data_location + 'edge_data/'

# directory holding the node/edge property tables
properties_location = resource_data_location + 'property_data/'

# remote bucket folder of the current build; both data URLs hang off it
current_build_url = 'https://storage.googleapis.com/pheknowlator/current_build/data/'

# processed data url
processed_url = current_build_url + 'processed_data/'

# original data url
original_url = current_build_url + 'original_data/'

# owltools location
owltools_location = '../pkt_kg/libs/owltools'

***
# RDF properties (subClassOf and type)
***

In [None]:
# The two RDF/RDFS predicates are described by hand; they become the row index.
index_values = [
    URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
    URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
]

# One list per metadata column, ordered like `index_values`.
data = dict([
    ('Label', ['subClassOf', 'type']),
    ('Description', ['The subject is a subclass of a class.', 'The subject is an instance of a class.']),
    ('Synonym(s)', ['None', 'None']),
    ('Scope', ['None', 'None']),
])

# Build the frame with its index in a single step.
rdfScoAndType = pd.DataFrame(data, index=index_values)

rdfScoAndType

***
# OBO properties
***

In [None]:
# Namespace for OBO PURLs; attribute access builds full URIs (e.g. obo.IAO_0000115,
# the predicate queried for property descriptions further down).
obo = Namespace('http://purl.obolibrary.org/obo/')
# The scope annotation is not informative for the current build; it is kept on
# purpose because non-human RNA molecules are planned to be added in the future.
scope = URIRef('http://www.geneontology.org/formats/oboInOwl#hasScope')

In [None]:
def _first_english_value(graph, subject, predicate):
    """Return the first object of (subject, predicate) that is untagged or English.

    A candidate is accepted when its n3 serialisation contains no '@' at all
    or contains '@en'. The string 'None' is returned when nothing qualifies.
    """
    for candidate in graph.objects(subject, predicate):
        serialized = n3(candidate)
        if '@' not in serialized or '@en' in serialized:
            return str(candidate)
    return 'None'


def extract_relation_metadata(graphName):
    """Collect label, description, synonyms and scope for an ontology's object properties.

    Args:
        graphName: Ontology name; the file parsed is
            ``ontology_data_location + graphName + '_with_imports.owl'``.

    Returns:
        A pandas DataFrame indexed by property URI (as a plain string) with the
        columns 'Label', 'Description', 'Synonym(s)' and 'Scope'. Missing values
        are the string 'None'; multiple synonyms are joined with '|'. The four
        columns are present even when no object property is found, so the frame
        can always be merged with the others.
    """
    print('Processing ' + graphName.upper() + '...')

    OBOgraph = Graph()
    OBOgraph.parse(ontology_data_location + graphName + '_with_imports.owl')

    # Index every synonym-like triple by its subject in one pass over the graph,
    # instead of rescanning the full synonym list for each property.
    synonyms_by_subject = {}
    for subj, pred, obj in OBOgraph:
        if isinstance(subj, URIRef) and 'synonym' in str(pred).lower():
            synonyms_by_subject.setdefault(subj, []).append(str(obj))

    relation_metadata_dict = {}
    # gets_object_properties comes from the project helpers; it is assumed to
    # yield the URIs of the graph's object properties - TODO confirm.
    for prop in tqdm(list(gets_object_properties(OBOgraph))):
        prop_synonyms = synonyms_by_subject.get(prop, [])
        relation_metadata_dict[str(prop)] = {
            'Label': _first_english_value(OBOgraph, prop, RDFS.label),
            'Description': _first_english_value(OBOgraph, prop, obo.IAO_0000115),
            'Synonym(s)': '|'.join(prop_synonyms) if prop_synonyms else 'None',
            'Scope': _first_english_value(OBOgraph, prop, scope),
        }

    # Explicit columns keep the frame well-formed when the dict is empty.
    return pd.DataFrame(
        list(relation_metadata_dict.values()),
        index=list(relation_metadata_dict.keys()),
        columns=['Label', 'Description', 'Synonym(s)', 'Scope'],
    )

def merge_columns(row, col_name):
    """Combine the '<col_name>_x' and '<col_name>_y' values of a merged row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) holding both suffixed values.
        col_name: Base column name, without the '_x' / '_y' suffix.

    Returns:
        The shared value when both sides are equal; otherwise the two values
        joined with '|', with repeated '|'-separated entries removed (first
        occurrence wins, order preserved). 'None' placeholders are left in
        place for the caller to strip.
    """
    left = row[f"{col_name}_x"]
    right = row[f"{col_name}_y"]
    if left == right:
        return left
    # Either side may already be a '|'-joined list from an earlier merge, so
    # de-duplicate per entry; dict.fromkeys keeps first-seen order.
    entries = dict.fromkeys(f"{left}|{right}".split('|'))
    return '|'.join(entries)

def _drop_none_entries(value):
    """Remove 'None' placeholder entries from a '|'-separated string.

    Only entries that are exactly 'None' are dropped, so text that merely
    starts or ends with "None" is left intact. If every entry is a placeholder
    the result is 'None'. Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    kept = [entry for entry in value.split('|') if entry != 'None']
    return '|'.join(kept) if kept else 'None'


def merge_and_clean_data(df1, df2, columns_to_merge_and_clean):
    """Outer-join two metadata frames on their index and collapse shared columns.

    Args:
        df1: Frame whose values end up on the left ('_x') side of each pair.
        df2: Frame whose values end up on the right ('_y') side of each pair.
        columns_to_merge_and_clean: Column names present in both frames.

    Returns:
        A new DataFrame containing only `columns_to_merge_and_clean`. Each cell
        is the combination produced by `merge_columns`, with 'None'
        placeholders removed wherever a real value exists.
    """
    # Rows present in only one frame get the 'None' placeholder on the other side.
    merged = pd.merge(df1, df2, left_index=True, right_index=True, how='outer').fillna('None')

    for col in columns_to_merge_and_clean:
        merged[col] = merged.apply(lambda row: merge_columns(row, col), axis=1)

    # Copy the column subset so the assignments below write to an independent
    # frame and do not trigger SettingWithCopyWarning.
    merged = merged[columns_to_merge_and_clean].copy()

    for column in columns_to_merge_and_clean:
        merged[column] = merged[column].map(_drop_none_entries)

    return merged

In [None]:
ontologies = ['ro', 'hp', 'go', 'mondo', 'vo', 'chebi', 'ext', 'clo', 'pr', 'so', 'pw']
columns_to_merge = ['Label', 'Description', 'Synonym(s)', 'Scope']

# The first ontology seeds the accumulator; every later one is folded into it.
merged = None
for position, ontology in enumerate(ontologies):
    df = extract_relation_metadata(ontology)
    merged = df if position == 0 else merge_and_clean_data(df, merged, columns_to_merge)

# The hand-written RDF rows go first, followed by the merged OBO properties.
merged = pd.concat([rdfScoAndType, merged])
merged

In [None]:
# Some OBO properties carry no label: fall back to the last path segment of
# the property URI (rpartition proved faster than partition, split and rsplit).
merged['Label'] = [
    uri.rpartition('/')[-1] if label == 'None' else label
    for uri, label in zip(merged.index, merged['Label'])
]
merged

In [None]:
# List the properties whose merged label still contains a '|' separator, i.e.
# several label variants; these are the ones fixed manually.
# The match is literal (regex=False), which avoids the invalid '\|' escape.
list(merged[merged['Label'].str.contains('|', regex=False)].index)

In [None]:
# Manually curated labels, keyed by the OBO identifier of the property.
# They are written with a single `.loc[row, column]` assignment: the chained
# form `merged.loc[row]['Label'] = ...` may write to a temporary copy and
# leave `merged` unchanged.
obo_prefix = 'http://purl.obolibrary.org/obo/'
label_overrides = {
    'BFO_0000056': 'participates in|participates in at some time',
    'BSPO_0000096': 'anterior to',
    'BSPO_0000097': 'distal to',
    'BSPO_0000098': 'dorsal to',
    'BSPO_0000099': 'posterior to',
    'BSPO_0000102': 'ventral to',
    'BSPO_0000107': 'deep to',
    'BSPO_0000108': 'superficial to',
    'BSPO_0000120': 'in left side of',
    'BSPO_0000121': 'in right side of',
    'BSPO_0000122': 'in posterior side of',
    'BSPO_0000123': 'in anterior side of',
    'BSPO_0000124': 'in proximal side of',
    'BSPO_0000125': 'in distal side of',
    'BSPO_0000126': 'in lateral side of',
    'BSPO_0001106': 'proximalmost part of',
    'BSPO_0001107': 'immediately deep to',
    'BSPO_0001108': 'distalmost part of',
    'GOREL_0002004': 'results in fission of',
    'RO_0001019': 'contains',
    'RO_0002158': 'homologous to|shares ancestor with',

    'BFO_0000050': 'part of',
    'BFO_0000051': 'has part',
    'RO_0000052': 'inheres in|characteristic of',
    'RO_0000053': 'bearer of|has characteristic',
    'RO_0000086': 'has quality',
    'RO_0001000': 'derives from',
    'RO_0001025': 'located in',
    'RO_0002091': 'starts during',
    'RO_0002093': 'ends during',
    'RO_0002160': 'only in taxon',
    'RO_0002180': 'has component',
    'RO_0002215': 'capable of',
    'RO_0002331': 'involved in',
    'RO_0002353': 'output of',
    'RO_0002448': 'directly regulates activity of|activity directly regulates activity of',
    'RO_0002449': 'directly negatively regulates activity of|activity directly negatively regulates activity of',
    'RO_0002450': 'directly positively regulates activity of|activity directly positively regulates activity of',
    'RO_0002506': 'causal relation between entities|causal relation between material entities',
    'RO_0004021': 'disease has basis in disruption of|disease caused by disruption of',
    'RO_0004022': 'disease has basis in feature|disease arises from feature',
}

# `.loc[row, col] = value` would silently append a row for an unknown key, so
# check first and keep the KeyError that the row lookup used to raise.
missing_uris = [
    obo_prefix + short_id
    for short_id in label_overrides
    if obo_prefix + short_id not in merged.index
]
if missing_uris:
    raise KeyError(f'Properties to relabel are missing from merged: {missing_uris}')

for short_id, curated_label in label_overrides.items():
    merged.loc[obo_prefix + short_id, 'Label'] = curated_label

In [None]:
# Keep the three exported columns, drop fully duplicated rows, and write the
# table (with its URI index) to the property-data directory.
edge_types = merged[['Label','Description','Synonym(s)']].drop_duplicates()
edge_types.to_csv(properties_location + 'EdgeTypes.csv')