Skip to content

Commit

Permalink
Merge pull request #64 from ChalkLab/63-iterate-function-branch
Browse files Browse the repository at this point in the history
Fix for the iterate function in SciData object
  • Loading branch information
marshallmcdonnell committed Jun 15, 2021
2 parents 2104273 + 6a56efd commit 6a35ae7
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 26 deletions.
81 changes: 71 additions & 10 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from scidatalib.scidata import SciData
import json
import pandas as pd
import numpy as np

uid = 'example'
example = SciData(uid)
Expand Down Expand Up @@ -193,7 +192,69 @@

dps = [pnt1, pnt2]

dps = [{"@id": "datapoint", "@type": "sdo:datapoint", "activity_id": 16464576, "assay": "CHEMBL3767769", "data": [{"type": "IC50", "@id": "datum", "@type": "sdo:exptdata", "value": {"relation": "=", "@id": "value", "@type": "sdo:value", "value": "19.000000000000000000000000000000", "units": "uM"}}, {"@id": "datum", "@type": "sdo:deriveddata", "value": {"standard_relation": "=", "@id": "value", "@type": "sdo:value", "standard_value": "19000.000000000000000000000000000000", "standard_units": "nM", "standard_type": "IC50", "pchembl_value": "4.72", "uo_units": "obo:UO_0000065", "qudt_units": "qudt:NanoMOL-PER-L"}}, {"@id": "datum", "@type": "sdo:None", "value": {"standard_flag": "1", "@id": "value", "@type": "sdo:value", "activity_id": "16464576"}}]}, {"@id": "datapoint", "annotation": "gb:P04524", "conditions": "Observation", "value": {"@id": "textvalue", "text": "The solution was clear, no reagent precipitation was observed.", "textype": "plain", "language": "en-us"}}]
# --- Example datapoints ---------------------------------------------------
# Datapoint 1 carries three datum records: an experimental IC50, the
# ChEMBL-standardized derivation of that value, and an untyped flag record.

# Experimental IC50 measurement (micromolar).
dp1_datum1 = {
    "@id": "datum",
    "@type": "sdo:exptdata",
    "type": "IC50",
    "value": {
        "@id": "value",
        "@type": "sdo:value",
        "relation": "=",
        "units": "uM",
        "value": "19.000000000000000000000000000000"
    }
}

# Same measurement after ChEMBL standardization (nanomolar + pChEMBL).
dp1_datum2 = {
    "@id": "datum",
    "@type": "sdo:deriveddata",
    "value": {
        "standard_relation": "=",
        "@id": "value",
        "@type": "sdo:value",
        "standard_value": "19000.000000000000000000000000000000",
        "standard_units": "nM",
        "standard_type": "IC50",
        "pchembl_value": "4.72",
        "uo_units": "obo:UO_0000065",
        "qudt_units": "qudt:NanoMOL-PER-L"
    }
}

# Bookkeeping record (standard flag / activity id); no specific datum type.
dp1_datum3 = {
    "@id": "datum",
    "@type": "sdo:None",
    "value": {
        "standard_flag": "1",
        "@id": "value",
        "@type": "sdo:value",
        "activity_id": "16464576"
    }
}

# Input datapoint 1: bundles the three datum records above.
dp1 = {
    "@id": "datapoint",
    "@type": "sdo:datapoint",
    "activity_id": 16464576,
    "assay": "CHEMBL3767769",
    "data": [dp1_datum1, dp1_datum2, dp1_datum3]
}

# Input datapoint 2: a free-text observation annotated with an IUPAC
# Gold Book reference.
dp2 = {
    "@id": "datapoint",
    "annotation": "gb:P04524",
    "conditions": "Observation",
    "value": {
        "@id": "textvalue",
        "text":
        "The solution was clear, no reagent precipitation was observed.",
        "textype": "plain",
        "language": "en-us"
    }
}

dps = [dp1, dp2]

# Register both datapoints with the SciData document.
example.datapoint(dps)

Expand All @@ -218,10 +279,10 @@
'values_numpy_array': str(ser1_numpy_array),
'values_numpy_list': str(ser1_numpy_list),
'values_numpy_json': str(ser1_numpy_json)}
for k,v in ser1_dict.items():
dataser1.update({str(k):str(v)})
for k,v in ser1_dict_str.items():
dataser1.update({str('str_'+k):v})
for k, v in ser1_dict.items():
dataser1.update({str(k): str(v)})
for k, v in ser1_dict_str.items():
dataser1.update({str('str_'+k): v})

ser2_input = {'colA': [10, 20, 30]}
ser2_dataframe = pd.DataFrame(ser2_input)
Expand All @@ -244,10 +305,10 @@
'values_numpy_array': str(ser2_numpy_array),
'values_numpy_list': str(ser2_numpy_list),
'values_numpy_json': str(ser2_numpy_json)}
for k,v in ser2_dict.items():
dataser2.update({str(k):str(v)})
for k,v in ser2_dict_str.items():
dataser2.update({str('str_'+k):v})
for k, v in ser2_dict.items():
dataser2.update({str(k): str(v)})
for k, v in ser2_dict_str.items():
dataser2.update({str('str_'+k): v})

example.dataseries([dataser1, dataser2])

Expand Down
2 changes: 1 addition & 1 deletion scidatalib/io/rruff.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def _read_get_facets_section(rruff_dict: dict) -> dict:
facets = []
material = {
"@id": "material",
"@type": ["sdo:facet", "sdo:material"],
"@type": "sdo:material",
"name": rruff_dict.get("names", ""),
"materialType": rruff_dict.get("ideal chemistry", ""),
}
Expand Down
50 changes: 36 additions & 14 deletions scidatalib/scidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,46 +737,68 @@ def __iterate_function(self, it, level, cnt_index, cat_index):
"""
new_cat_index = cat_index.copy()

# If we simply have string, return, end recursion
if isinstance(it, str):
self.__addid(it)
return it, 1, cnt_index, cat_index

# Set the category
if '@id' in it:
category = it['@id']
elif 'descriptors' in it or 'identifiers' in it:
category = 'compound'
else:
category = 'undefined'

count = 1
# Increase count if already encountered category; initialize otherwise
if category in cnt_index:
count = cnt_index[category] + 1
cnt_index[category] += 1
else:
cnt_index[category] = 1

# Update state holding level of nesting and associated category
new_cat_index.update({level: category})

# Set the @id and @type based on the category and count
uid = ''
for cat in list(new_cat_index.values()):
uid += cat + '/' + str(count) + '/'
uid += cat + '/' + str(cnt_index[cat]) + '/'

# Loop over non-id or non-type entries to recursively process
# sub-elements of the list of objects
temp: dict = {'@id': uid, '@type': 'sdo:' + category}
for k in it.keys():
if k != '@id':
if isinstance(it[k], list):
for key, value in it.items():

# Already constructed the @id, so iterate only on non-@id entries
if key != '@id':
count = cnt_index[category]

# For list, recursively process elements in sub-list
if isinstance(value, list):
level += 1
for i, y in enumerate(it[k]):
it[k][i], category, count, new_cat_index = \
for i, y in enumerate(value):
value[i], category, count, new_cat_index = \
self.__iterate_function(
y, level, cnt_index, new_cat_index)
temp[k] = it[k]
temp[key] = value
level -= 1
elif isinstance(it[k], dict):

# For dict, recursively process key-values in sub-dict
elif isinstance(value, dict):
level += 1
temp[k], category, count, new_cat_index = \
temp[key], category, count, new_cat_index = \
self.__iterate_function(
it[k], level, cnt_index, new_cat_index)
value, level, cnt_index, new_cat_index)
level -= 1

# Simply add the value to list of objects to return
else:
temp[k] = it[k]
self.__addid(it[k])
temp[key] = value
self.__addid(value)

# Remove the last added "leaf" level to trim the @id value correctly
new_cat_index.pop(level)
cnt_index[category] = 0
return temp, category, count, new_cat_index

@property
Expand Down
2 changes: 1 addition & 1 deletion tests/io/test_rruff.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_read_rruff(raman_soddyite_file):
assert len(system["facets"]) == 1
facet = system["facets"][0]
assert facet["@id"] == "material/1/"
assert len(facet["@type"]) == 2
assert facet["@type"] == "sdo:material"
assert facet["materialType"] == "(UO_2_)_2_SiO_4_·2H_2_O"
assert facet["name"] == "Soddyite"

Expand Down
95 changes: 95 additions & 0 deletions tests/test_scidata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""pytest test class for scidata.py"""
import copy
from scidatalib.scidata import SciData
from datetime import datetime
import pytest
Expand Down Expand Up @@ -203,6 +204,100 @@ def test_datapoint(sd):
assert sd.datapoint([pnt]) == [out]


def test_datapoint_nested(sd):
    """Test multiple, nested datum in datapoints for correct enumeration"""
    sd.namespaces({'gb': 'https://goldbook.iupac.org/terms/view/'})

    # Input datapoint 1: three datum records nested under one datapoint.
    dp1_datum1 = {
        "@id": "datum",
        "@type": "sdo:exptdata",
        "type": "IC50",
        "value": {
            "@id": "value",
            "@type": "sdo:value",
            "relation": "=",
            "units": "uM",
            "value": "19.000000000000000000000000000000"
        }
    }

    dp1_datum2 = {
        "@id": "datum",
        "@type": "sdo:deriveddata",
        "value": {
            "standard_relation": "=",
            "@id": "value",
            "@type": "sdo:value",
            "standard_value": "19000.000000000000000000000000000000",
            "standard_units": "nM",
            "standard_type": "IC50",
            "pchembl_value": "4.72",
            "uo_units": "obo:UO_0000065",
            "qudt_units": "qudt:NanoMOL-PER-L"
        }
    }

    dp1_datum3 = {
        "@id": "datum",
        "@type": "sdo:None",
        "value": {
            "standard_flag": "1",
            "@id": "value",
            "@type": "sdo:value",
            "activity_id": "16464576"
        }
    }

    dp1 = {
        "@id": "datapoint",
        "@type": "sdo:datapoint",
        "activity_id": 16464576,
        "assay": "CHEMBL3767769",
        "data": [dp1_datum1, dp1_datum2, dp1_datum3]
    }

    # Input datapoint 2: a single text-valued observation.
    dp2 = {
        "@id": "datapoint",
        "@type": "sdo:datapoint",
        "annotation": "gb:P04524",
        "conditions": "Observation",
        "value": {
            "@id": "textvalue",
            "@type": "sdo:textvalue",
            "text":
            "The solution was clear, no reagent precipitation was observed.",  # noqa
            "textype": "plain",
            "language": "en-us"
        }
    }

    # Build the expected output: same payloads, but every @id rewritten
    # with its enumerated, slash-delimited path.
    out_dp1 = copy.deepcopy(dp1)
    out_dp1["@id"] = "datapoint/1/"
    # Each datum in the list is numbered 1..3; each nested value restarts
    # its own counter at 1.
    for num, datum in enumerate(out_dp1["data"], start=1):
        datum["@id"] = "datapoint/1/datum/{}/".format(num)
        datum["value"]["@id"] = "datapoint/1/datum/{}/value/1/".format(num)

    out_dp2 = copy.deepcopy(dp2)
    out_dp2["@id"] = "datapoint/2/"
    out_dp2["value"]["@id"] = "datapoint/2/textvalue/1/"

    assert sd.datapoint([dp1, dp2]) == [out_dp1, out_dp2]


def test_datagroup_with_datapoints(sd):
sd.namespaces(
{
Expand Down

0 comments on commit 6a35ae7

Please sign in to comment.