Skip to content

Commit

Permalink
Merge pull request #64 from ChalkLab/63-iterate-function-branch
Browse files Browse the repository at this point in the history
Fix for the iterate function in SciData object
  • Loading branch information
marshallmcdonnell committed Jun 15, 2021
2 parents 2104273 + 6a56efd commit 6a35ae7
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 26 deletions.
81 changes: 71 additions & 10 deletions example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
from scidatalib.scidata import SciData
import json
import pandas as pd
import numpy as np

uid = 'example'
example = SciData(uid)
Expand Down Expand Up @@ -193,7 +192,69 @@

dps = [pnt1, pnt2]

dps = [{"@id": "datapoint", "@type": "sdo:datapoint", "activity_id": 16464576, "assay": "CHEMBL3767769", "data": [{"type": "IC50", "@id": "datum", "@type": "sdo:exptdata", "value": {"relation": "=", "@id": "value", "@type": "sdo:value", "value": "19.000000000000000000000000000000", "units": "uM"}}, {"@id": "datum", "@type": "sdo:deriveddata", "value": {"standard_relation": "=", "@id": "value", "@type": "sdo:value", "standard_value": "19000.000000000000000000000000000000", "standard_units": "nM", "standard_type": "IC50", "pchembl_value": "4.72", "uo_units": "obo:UO_0000065", "qudt_units": "qudt:NanoMOL-PER-L"}}, {"@id": "datum", "@type": "sdo:None", "value": {"standard_flag": "1", "@id": "value", "@type": "sdo:value", "activity_id": "16464576"}}]}, {"@id": "datapoint", "annotation": "gb:P04524", "conditions": "Observation", "value": {"@id": "textvalue", "text": "The solution was clear, no reagent precipitation was observed.", "textype": "plain", "language": "en-us"}}]
# --- Example datapoints ---------------------------------------------------
# Datapoint 1 carries three datum records: an experimental IC50, the
# ChEMBL-standardized derivation of that value, and an untyped flag record.

# Experimental IC50 measurement (micromolar).
dp1_datum1 = {
    "@id": "datum",
    "@type": "sdo:exptdata",
    "type": "IC50",
    "value": {
        "@id": "value",
        "@type": "sdo:value",
        "relation": "=",
        "units": "uM",
        "value": "19.000000000000000000000000000000"
    }
}

# Same measurement after ChEMBL standardization (nanomolar + pChEMBL).
dp1_datum2 = {
    "@id": "datum",
    "@type": "sdo:deriveddata",
    "value": {
        "standard_relation": "=",
        "@id": "value",
        "@type": "sdo:value",
        "standard_value": "19000.000000000000000000000000000000",
        "standard_units": "nM",
        "standard_type": "IC50",
        "pchembl_value": "4.72",
        "uo_units": "obo:UO_0000065",
        "qudt_units": "qudt:NanoMOL-PER-L"
    }
}

# Bookkeeping record (standard flag / activity id); no specific datum type.
dp1_datum3 = {
    "@id": "datum",
    "@type": "sdo:None",
    "value": {
        "standard_flag": "1",
        "@id": "value",
        "@type": "sdo:value",
        "activity_id": "16464576"
    }
}

# Input datapoint 1: bundles the three datum records above.
dp1 = {
    "@id": "datapoint",
    "@type": "sdo:datapoint",
    "activity_id": 16464576,
    "assay": "CHEMBL3767769",
    "data": [dp1_datum1, dp1_datum2, dp1_datum3]
}

# Input datapoint 2: a free-text observation annotated with an IUPAC
# Gold Book reference.
dp2 = {
    "@id": "datapoint",
    "annotation": "gb:P04524",
    "conditions": "Observation",
    "value": {
        "@id": "textvalue",
        "text":
        "The solution was clear, no reagent precipitation was observed.",
        "textype": "plain",
        "language": "en-us"
    }
}

dps = [dp1, dp2]

# Register both datapoints with the SciData document.
example.datapoint(dps)

Expand All @@ -218,10 +279,10 @@
'values_numpy_array': str(ser1_numpy_array),
'values_numpy_list': str(ser1_numpy_list),
'values_numpy_json': str(ser1_numpy_json)}
for k,v in ser1_dict.items():
dataser1.update({str(k):str(v)})
for k,v in ser1_dict_str.items():
dataser1.update({str('str_'+k):v})
for k, v in ser1_dict.items():
dataser1.update({str(k): str(v)})
for k, v in ser1_dict_str.items():
dataser1.update({str('str_'+k): v})

ser2_input = {'colA': [10, 20, 30]}
ser2_dataframe = pd.DataFrame(ser2_input)
Expand All @@ -244,10 +305,10 @@
'values_numpy_array': str(ser2_numpy_array),
'values_numpy_list': str(ser2_numpy_list),
'values_numpy_json': str(ser2_numpy_json)}
for k,v in ser2_dict.items():
dataser2.update({str(k):str(v)})
for k,v in ser2_dict_str.items():
dataser2.update({str('str_'+k):v})
for k, v in ser2_dict.items():
dataser2.update({str(k): str(v)})
for k, v in ser2_dict_str.items():
dataser2.update({str('str_'+k): v})

example.dataseries([dataser1, dataser2])

Expand Down
2 changes: 1 addition & 1 deletion scidatalib/io/rruff.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def _read_get_facets_section(rruff_dict: dict) -> dict:
facets = []
material = {
"@id": "material",
"@type": ["sdo:facet", "sdo:material"],
"@type": "sdo:material",
"name": rruff_dict.get("names", ""),
"materialType": rruff_dict.get("ideal chemistry", ""),
}
Expand Down
50 changes: 36 additions & 14 deletions scidatalib/scidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,46 +737,68 @@ def __iterate_function(self, it, level, cnt_index, cat_index):
"""
new_cat_index = cat_index.copy()

# If we simply have string, return, end recursion
if isinstance(it, str):
self.__addid(it)
return it, 1, cnt_index, cat_index

# Set the category
if '@id' in it:
category = it['@id']
elif 'descriptors' in it or 'identifiers' in it:
category = 'compound'
else:
category = 'undefined'

count = 1
# Increase count if already encountered category; initialize otherwise
if category in cnt_index:
count = cnt_index[category] + 1
cnt_index[category] += 1
else:
cnt_index[category] = 1

# Update state holding level of nesting and associated category
new_cat_index.update({level: category})

# Set the @id and @type based on the category and count
uid = ''
for cat in list(new_cat_index.values()):
uid += cat + '/' + str(count) + '/'
uid += cat + '/' + str(cnt_index[cat]) + '/'

# Loop over non-id or non-type entries to recursively process
# sub-elements of the list of objects
temp: dict = {'@id': uid, '@type': 'sdo:' + category}
for k in it.keys():
if k != '@id':
if isinstance(it[k], list):
for key, value in it.items():

# Already constructed the @id, so iterate only on non-@id entries
if key != '@id':
count = cnt_index[category]

# For list, recursively process elements in sub-list
if isinstance(value, list):
level += 1
for i, y in enumerate(it[k]):
it[k][i], category, count, new_cat_index = \
for i, y in enumerate(value):
value[i], category, count, new_cat_index = \
self.__iterate_function(
y, level, cnt_index, new_cat_index)
temp[k] = it[k]
temp[key] = value
level -= 1
elif isinstance(it[k], dict):

# For dict, recursively process key-values in sub-dict
elif isinstance(value, dict):
level += 1
temp[k], category, count, new_cat_index = \
temp[key], category, count, new_cat_index = \
self.__iterate_function(
it[k], level, cnt_index, new_cat_index)
value, level, cnt_index, new_cat_index)
level -= 1

# Simply add the value to list of objects to return
else:
temp[k] = it[k]
self.__addid(it[k])
temp[key] = value
self.__addid(value)

# Remove the last added "leaf" level to trim the @id value correctly
new_cat_index.pop(level)
cnt_index[category] = 0
return temp, category, count, new_cat_index

@property
Expand Down
2 changes: 1 addition & 1 deletion tests/io/test_rruff.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def test_read_rruff(raman_soddyite_file):
assert len(system["facets"]) == 1
facet = system["facets"][0]
assert facet["@id"] == "material/1/"
assert len(facet["@type"]) == 2
assert facet["@type"] == "sdo:material"
assert facet["materialType"] == "(UO_2_)_2_SiO_4_·2H_2_O"
assert facet["name"] == "Soddyite"

Expand Down
95 changes: 95 additions & 0 deletions tests/test_scidata.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""pytest test class for scidata.py"""
import copy
from scidatalib.scidata import SciData
from datetime import datetime
import pytest
Expand Down Expand Up @@ -203,6 +204,100 @@ def test_datapoint(sd):
assert sd.datapoint([pnt]) == [out]


def test_datapoint_nested(sd):
    """Test multiple, nested datum in datapoints for correct enumeration"""
    sd.namespaces({'gb': 'https://goldbook.iupac.org/terms/view/'})

    # Input datapoint 1: three datum records nested under one datapoint.
    dp1_datum1 = {
        "@id": "datum",
        "@type": "sdo:exptdata",
        "type": "IC50",
        "value": {
            "@id": "value",
            "@type": "sdo:value",
            "relation": "=",
            "units": "uM",
            "value": "19.000000000000000000000000000000"
        }
    }

    dp1_datum2 = {
        "@id": "datum",
        "@type": "sdo:deriveddata",
        "value": {
            "standard_relation": "=",
            "@id": "value",
            "@type": "sdo:value",
            "standard_value": "19000.000000000000000000000000000000",
            "standard_units": "nM",
            "standard_type": "IC50",
            "pchembl_value": "4.72",
            "uo_units": "obo:UO_0000065",
            "qudt_units": "qudt:NanoMOL-PER-L"
        }
    }

    dp1_datum3 = {
        "@id": "datum",
        "@type": "sdo:None",
        "value": {
            "standard_flag": "1",
            "@id": "value",
            "@type": "sdo:value",
            "activity_id": "16464576"
        }
    }

    dp1 = {
        "@id": "datapoint",
        "@type": "sdo:datapoint",
        "activity_id": 16464576,
        "assay": "CHEMBL3767769",
        "data": [dp1_datum1, dp1_datum2, dp1_datum3]
    }

    # Input datapoint 2: a single text-valued observation.
    dp2 = {
        "@id": "datapoint",
        "@type": "sdo:datapoint",
        "annotation": "gb:P04524",
        "conditions": "Observation",
        "value": {
            "@id": "textvalue",
            "@type": "sdo:textvalue",
            "text":
            "The solution was clear, no reagent precipitation was observed.",  # noqa
            "textype": "plain",
            "language": "en-us"
        }
    }

    # Build the expected output: same payloads, but every @id rewritten
    # with its enumerated, slash-delimited path.
    out_dp1 = copy.deepcopy(dp1)
    out_dp1["@id"] = "datapoint/1/"
    # Each datum in the list is numbered 1..3; each nested value restarts
    # its own counter at 1.
    for num, datum in enumerate(out_dp1["data"], start=1):
        datum["@id"] = "datapoint/1/datum/{}/".format(num)
        datum["value"]["@id"] = "datapoint/1/datum/{}/value/1/".format(num)

    out_dp2 = copy.deepcopy(dp2)
    out_dp2["@id"] = "datapoint/2/"
    out_dp2["value"]["@id"] = "datapoint/2/textvalue/1/"

    assert sd.datapoint([dp1, dp2]) == [out_dp1, out_dp2]


def test_datagroup_with_datapoints(sd):
sd.namespaces(
{
Expand Down

0 comments on commit 6a35ae7

Please sign in to comment.