In [1]:
pip install pronto

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pronto
from pronto import Ontology
from collections import defaultdict
import json
import gzip
from ParsingModule import transform_nested_dict_to_tree, flatten

In [3]:
def get_obo_subclasses_and_elements(onto, obo_id, obo_label,obo_elements, d=None, distance=1):
    obo_elements.add(obo_label)
    if d is None:
        d = defaultdict(dict)

    """
    Get all subclasses of a given obo_id
    :param obo_id: obo_id to search for
    :param distance: distance to search for subclasses
    :return: list of subclasses
    """
    subclasses = list(onto[obo_id].subclasses(distance=1))
    if len(subclasses) > 1:
        d[obo_label] = {}
        for i in subclasses[1:]:
            obo_id = i.id
            obo_label = i.name
            obo_elements.add(obo_label)
            d[obo_label] = get_obo_subclasses(onto, obo_id, obo_label, obo_elements, defaultdict(dict), distance=1)
    else:
        d = {}
    d = remove_duplicate_values(d)
    return d, obo_elements

def get_obo_subclasses(onto, obo_id, obo_label, d=None, distance=1):

    if d is None:
        d = defaultdict(dict)

    """
    Get all subclasses of a given obo_id
    :param obo_id: obo_id to search for
    :param distance: distance to search for subclasses
    :return: list of subclasses
    """
    subclasses = list(onto[obo_id].subclasses(distance=1))
    if len(subclasses) > 1:
        d[obo_label] = {}
        for i in subclasses[1:]:
            obo_id = i.id
            obo_label = i.name
            d[obo_label] = get_obo_subclasses(onto, obo_id, obo_label, defaultdict(dict), distance=1)
    else:
        d = {}
    d = remove_duplicate_values(d)
    return d
    
def remove_duplicate_values(d):
    for k, v in d.items():
        if isinstance(v, dict):
            remove_duplicate_values(v)
        if k in v:
            del v[k]

    return d

In [4]:
def store_as_gzipped_json(data, filename):
    """"Given a datafile to store and the filename, this function stores the data as a gzipped json file in C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\data"""
    path = "C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\data\\" + filename + ".json.gz"
    with gzip.open(path, 'wt') as f:
        json.dump(data, f)
    return(f"Stored {filename} as gzipped json")

In [5]:
def open_gzipped_json(filename):
    path = "C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\data" + filename + ".json.gz"
    with gzip.open(path, 'rt') as f:
        data = json.load(f)
    return(data)

# MS

'comment[cleavage agent details]', ==> MS:1001045<br>
'comment[instrument]',==>ontology MS:1000031<br>
comment[dissociation method] MS:1000044

In [None]:
ms = Ontology("C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\ontology\\psi-ms.obo")

## Cleavage agent details: a list, no substructure

In [None]:
cleavage_list= []
i = list(ms['MS:1001045'].subclasses())
for _ in i:
    cleavage_list.append(_.name)
cleavage_list = cleavage_list[1:]

In [None]:
cleavage_agent_dict = get_obo_subclasses(ms, 'MS:1001045', 'cleavage agent name')
cleavage_agent_dict.pop('cleavage agent name')
cleavage_agent_dict

In [None]:
cleavage_agent_nodes = transform_nested_dict_to_tree(cleavage_agent_dict)
all_cleavage_agent_elements = flatten(cleavage_agent_dict)
store_as_gzipped_json(cleavage_agent_dict, "cleavage_agent_dict")
store_as_gzipped_json(cleavage_agent_nodes, "cleavage_agent_nodes")
store_as_gzipped_json(all_cleavage_agent_elements, "all_cleavage_agent_elements")

In [None]:
store_as_gzipped_json(cleavage_list, 'cleavage_list')

## Instrument model. Substructure

In [None]:
instrument_dict = get_obo_subclasses(ms, 'MS:1000031', 'instrument model', distance=1)
instrument_dict.pop('instrument model')

In [None]:
instrument_dict

In [None]:
instrument_nodes = transform_nested_dict_to_tree(instrument_dict)
all_instrument_elements = flatten(instrument_dict)
store_as_gzipped_json(instrument_dict, "instrument_dict")
store_as_gzipped_json(instrument_nodes, "instrument_nodes")
store_as_gzipped_json(all_instrument_elements, "all_instrument_elements")

## Dissociation method:MS:1000044

In [32]:
dissociation_dict = get_obo_subclasses(ms, 'MS:1000044', 'dissociation method', distance=1)
dissociation_dict.pop('dissociation method')

{}

In [36]:
dissociation_dict['collision-induced dissociation']

collections.defaultdict

In [None]:
dissociation_nodes = transform_nested_dict_to_tree(dissociation_dict)
all_dissociation_elements = flatten(dissociation_dict)
store_as_gzipped_json(dissociation_dict, "dissociation_dict")
store_as_gzipped_json(dissociation_nodes, "dissociation_nodes")
store_as_gzipped_json(all_dissociation_elements, "all_dissociation_elements")

# PRIDE

In [None]:
# a datetime error appeared, we don't need the creation time so let's remove it
file1 = open('pride_cv.obo',
             'r')
 
# defining object file2 to
# open GeeksforGeeksUpdated file
# in write mode
file2 = open('pride_cv_updated.obo',
             'w')
 
# reading each line from original
# text file
for line in file1.readlines():
   
     # reading all lines that do not
     # begin with "TextGenerator"
    if not (line.startswith('creat')):
         
        # storing only those lines that
        # do not begin with "TextGenerator"
        file2.write(line)
 
# close and save the files
file2.close()
file1.close()

In [None]:
pride = Ontology("/home/compomics/git/Publication/lesSDRF/ontology/pride_cv_updated.obo")

if fractionated: comment[ fractionation method]PRIDE:0000550<br>
'comment[label]', ==> label free, TMT channelsPRIDE:0000514

## Fractionation method 

In [None]:
fractionation_dict = get_obo_subclasses(pride, 'PRIDE:0000550', 'Fractionation method', distance=1)
fractionation_dict
# PRIDE

In [None]:
fractionation_dict.pop('Fractionation method')

In [None]:
fractionation_nodes = transform_nested_dict_to_tree(fractionation_dict)
all_fractionation_elements = flatten(fractionation_dict)
store_as_gzipped_json(fractionation_dict, "fractionation_dict")
store_as_gzipped_json(fractionation_nodes, "fractionation_nodes")
store_as_gzipped_json(all_fractionation_elements, "all_fractionation_elements")

## Label

In [18]:
label_dict = get_obo_subclasses(pride, 'PRIDE:0000514', 'Label', distance=1)
label_dict
# PRIDE

defaultdict(dict,
            {'Label': {},
             'label free sample': {},
             'TMT': defaultdict(dict,
                         {'TMT126': {},
                          'TMT127': {},
                          'TMT127C': {},
                          'TMT127N': {},
                          'TMT128': {},
                          'TMT128C': {},
                          'TMT128N': {},
                          'TMT129': {},
                          'TMT129C': {},
                          'TMT129N': {},
                          'TMT130': {},
                          'TMT130C': {},
                          'TMT130N': {},
                          'TMT131': {},
                          'TMT131N': {},
                          'TMT131C': {},
                          'TMT132N': {},
                          'TMT132C': {},
                          'TMT133N': {},
                          'TMT133C': {},
                          'TMT134N': {}}),
             'ITRAQ': d

In [19]:
label_dict.pop('Label')

{}

In [20]:
#make a defaultdict for the heavy, intermediate and light label
label_dict["Stable isotope dimethyl labeling"] = defaultdict(dict)
label_dict["Stable isotope dimethyl labeling"]["Heavy"] = {}
label_dict["Stable isotope dimethyl labeling"]["Light"] = {}
label_dict["Stable isotope dimethyl labeling"]["Intermediate"] = {}

In [21]:
label_dict

defaultdict(dict,
            {'label free sample': {},
             'TMT': defaultdict(dict,
                         {'TMT126': {},
                          'TMT127': {},
                          'TMT127C': {},
                          'TMT127N': {},
                          'TMT128': {},
                          'TMT128C': {},
                          'TMT128N': {},
                          'TMT129': {},
                          'TMT129C': {},
                          'TMT129N': {},
                          'TMT130': {},
                          'TMT130C': {},
                          'TMT130N': {},
                          'TMT131': {},
                          'TMT131N': {},
                          'TMT131C': {},
                          'TMT132N': {},
                          'TMT132C': {},
                          'TMT133N': {},
                          'TMT133C': {},
                          'TMT134N': {}}),
             'ITRAQ': defaultdict(dict,
         

In [22]:
label_nodes = transform_nested_dict_to_tree(label_dict)
all_label_elements = flatten(label_dict)
store_as_gzipped_json(label_dict, "label_dict")
store_as_gzipped_json(label_nodes, "label_nodes")
store_as_gzipped_json(all_label_elements, "all_label_elements")

'Stored all_label_elements as gzipped json'

reduction reagent 
alkylation reagent

In [None]:
reduction_dict = get_obo_subclasses(pride, 'PRIDE:0000607', 'reduction reagent', distance=1)
reduction_dict.pop("reduction reagent")

In [None]:
reduction_nodes = transform_nested_dict_to_tree(reduction_dict)
all_reduction_elements = flatten(reduction_dict)
store_as_gzipped_json(reduction_dict, "reduction_dict")
store_as_gzipped_json(reduction_nodes, "reduction_nodes")
store_as_gzipped_json(all_reduction_elements, "all_reduction_elements")

In [None]:
alkylation_dict = get_obo_subclasses(pride, 'PRIDE:0000598', 'alkylation reagent', distance=1)
alkylation_dict.pop("alkylation reagent")
alkylation_nodes = transform_nested_dict_to_tree(alkylation_dict)
all_alkylation_elements = flatten(alkylation_dict)
store_as_gzipped_json(alkylation_dict, "alkylation_dict")
store_as_gzipped_json(alkylation_nodes, "alkylation_nodes")
store_as_gzipped_json(all_alkylation_elements, "all_alkylation_elements")

# NCBITaxon

In [5]:
ncbi = Ontology("C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\ontology\\ncbitaxon.obo")

In [6]:
# find all end leafs in the obo tree and store them in a list
def get_obo_leafs(obo, root, root_name):
    leafs = []
    i = list(obo[root].subclasses())
    for _ in i:
        if len(_.subclasses()) == 0:
            leafs.append(_.name)
        else:
            leafs.extend(get_obo_leafs(obo, _, _.name))
    return leafs

In [12]:
list(ncbi['NCBITaxon:1'].subclasses())

KeyboardInterrupt: 

In [7]:
leafs_list = get_obo_leafs(ncbi, 'NCBITaxon:1', 'root')

TypeError: object of type 'SubclassesHandler' has no len()

In [None]:
root_dict = get_obo_subclasses(ncbi, 'NCBITaxon:1', 'root', distance=1)

In [None]:
taxonomy_nodes = transform_nested_dict_to_tree(root_dict)
all_taxonomy_elements = flatten(root_dict)
store_as_gzipped_json(root_dict, "taxonomy_dict")
store_as_gzipped_json(taxonomy_nodes, "taxonomy_nodes")
store_as_gzipped_json(all_taxonomy_elements, "all_taxonomy_elements")

Very large dict, causes too long waiting times ==> split in different taxa?

In [8]:
virus_dict = get_obo_subclasses(ncbi, 'NCBITaxon:10239', 'Viruses', distance=1)

In [9]:
archaea_dict = get_obo_subclasses(ncbi, 'NCBITaxon:2157', 'Archaea', distance=1)
bacteria_dict = get_obo_subclasses(ncbi, 'NCBITaxon:2', 'Bacteria', distance=1)
other_sequences_dict = get_obo_subclasses(ncbi, 'NCBITaxon:28384', 'other sequences', distance=1)
unclassified_dict = get_obo_subclasses(ncbi, 'NCBITaxon:12908', 'unclassified entries', distance=1)

In [6]:
eukaryota_dict = get_obo_subclasses(ncbi, 'NCBITaxon:2759', 'Eukaryota', distance=1)

In [42]:
sp = get_obo_subclasses(ncbi, "NCBITaxon:88918", "Trimeniaceae", distance=1 )

In [43]:
sp
# count unique elements in nested dictionary
def count_unique_elements(nested_dict):
    unique_elements = []
    for key, value in nested_dict.items():
        if isinstance(value, dict):
            unique_elements.append(key)
            unique_elements.extend(count_unique_elements(value))
        else:
            unique_elements.append(key)
    return list(set(unique_elements))
sp_elem = count_unique_elements(sp)

In [57]:
eukaryota_dict.keys()

dict_keys(['Eukaryota', 'Breviatea', 'Hemimastigophora', 'Rhodelphea', 'Haptista', 'CRuMs', 'Metamonada', 'Discoba', 'Eukaryota incertae sedis', 'Ancyromonadida', 'Sar', 'Rhodophyta', 'Malawimonadida', 'Provora', 'Cryptophyceae', 'Viridiplantae', 'Opisthokonta', 'Glaucocystophyceae', 'unclassified eukaryotes', 'Apusozoa', 'Amoebozoa', 'environmental samples <eukaryotes,superkingdom Eukaryota>', ('environmental samples <eukaryotes,superkingdom Eukaryota>',), 'environmental samples'])

In [7]:
print(len(euk_elem))

1679405


In [10]:
archaea_dict.pop('Archaea')
bacteria_dict.pop('Bacteria')
eukaryota_dict.pop('Eukaryota')
other_sequences_dict.pop('other sequences')
unclassified_dict.pop('unclassified entries')
virus_dict.pop('Viruses')

NameError: name 'eukaryota_dict' is not defined

In [1]:
taxonomies = [virus_dict, archaea_dict, bacteria_dict, eukaryota_dict, other_sequences_dict, unclassified_dict]	
names = ["virus", "archaea", "bacteria", "eukaryota", "other_sequences", "unclassified"]
for i, name in zip(taxonomies, names):
    nodes = transform_nested_dict_to_tree(i)
    elements = flatten(i)
    store_as_gzipped_json(i, f"{name}_dict")
    store_as_gzipped_json(nodes, f"{name}_nodes")
    store_as_gzipped_json(set(elements), f"all_{name}_elements")
    print(f"{name} done")

NameError: name 'virus_dict' is not defined

In [8]:
eu_elem = flatten(eukaryota_dict)

TypeError: unhashable type: 'collections.defaultdict'

In [78]:
from sys import getsizeof

In [10]:
getsizeof(euk_elem)

67109080

In [80]:
getsizeof(species_list)

13533400

In [51]:
getsizeof(eukaryota_dict)

1184

In [20]:
getsizeof(eu_elem) - getsizeof(set(eu_elem))

-53673784

In [21]:
getsizeof(eu_elem) - getsizeof(eukaryota_dict)

13434112

In [65]:
import networkx
import obonet

graph = obonet.read_obo("C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\ontology\\ncbitaxon.obo")

In [67]:
print(f"There are {len(graph)} nodes") #number of nodes
print(f"There are {graph.number_of_edges()} edges") #number of edges

There are 2490774 nodes
There are 2490772 edges


In [68]:
networkx.is_directed_acyclic_graph(graph)

True

In [70]:
graph.nodes["NCBITaxon:1639121"]

{'name': 'Haemoproteidae',
 'namespace': 'ncbi_taxonomy',
 'xref': ['GC_ID:1'],
 'is_a': ['NCBITaxon:5819'],
 'property_value': ['has_rank NCBITaxon:family']}

In [82]:
eukaryota_list = []
for i in networkx.ancestors(graph,"NCBITaxon:2759"):
   eukaryota_list.append(graph.nodes[i]["name"])

virus_list = []
for i in networkx.ancestors(graph,"NCBITaxon:10239"):
   virus_list.append(graph.nodes[i]["name"])

bacteria_list = []
for i in networkx.ancestors(graph,"NCBITaxon:2"):
   bacteria_list.append(graph.nodes[i]["name"])

In [83]:
print(len(virus_list), len(bacteria_list), len(eukaryota_list))

236153 541922 1679404


In [79]:
getsizeof(species_list)

13533400

In [81]:
len(euk_elem)

1679405

# CL

In [19]:
cl = Ontology("/home/compomics/git/Publication/lesSDRF/ontology/cl-basic.obo", )

In [20]:
cell_list= []
i = list(cl['CL:0000000'].subclasses())
for _ in i:
    cell_list.append(_.name)
cell_list = cell_list[1:]

In [25]:
cell_dict = get_obo_subclasses(cl, 'CL:0000000', 'cell')
cell_dict.pop('cell')
cell_dict

defaultdict(dict,
            {'native cell': defaultdict(dict,
                         {'germ line cell': defaultdict(dict,
                                      {'germ line stem cell': defaultdict(dict,
                                                   {'male germ line stem cell': {},
                                                    'female germ line stem cell': {}}),
                                       'germ cell': defaultdict(dict,
                                                   {'male germ cell': defaultdict(dict,
                                                                {'male germ line stem cell': {},
                                                                 'spermatocyte': defaultdict(dict,
                                                                             {'primary spermatocyte': {},
                                                                              'secondary spermatocyte': {}}),
                                                    

In [26]:
cell_nodes = transform_nested_dict_to_tree(cell_dict)
all_cell_elements = flatten(cell_dict)
store_as_gzipped_json(cell_dict, "cell_dict")
store_as_gzipped_json(cell_nodes, "cell_nodes")
store_as_gzipped_json(all_cell_elements, "all_cell_elements")

'Stored all_cell_elements as gzipped json'

In [27]:
store_as_gzipped_json(cell_list, 'cell_list')

'Stored cell_list as gzipped json'