Skip to content

Commit

Permalink
Fix warnings with Pandas 1.5.0 and improve performance of get (#168)
Browse files Browse the repository at this point in the history
* Fix warnings with Pandas 1.5.0

* Improve performances of `nodes.get` and `edges.get`

Some column data types in the nodes and edges DataFrames
returned by the `get()` method may be `float` or `int` instead of `object`.
  • Loading branch information
GianlucaFicarelli committed Oct 3, 2022
1 parent d87e06f commit 02da3ed
Show file tree
Hide file tree
Showing 10 changed files with 239 additions and 47 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Improvements
- accessing data in Circuit and Simulation
- Circuit validation changed to be more config-driven
- it now only validates objects defined in the circuit configuration file
- Improved performance when loading nodes and edges from a circuit.
- Fixed warnings with Pandas 1.5.0

Breaking Changes
~~~~~~~~~~~~~~~~
Expand All @@ -26,7 +28,7 @@ Breaking Changes
- non-BBP Sonata circuit validation was removed
- The NodeStorage & EdgeStorage classes were removed
- point_neuron is no longer supported

- Some column data types in the nodes and edges DataFrames returned by the `get()` method may be `float` or `int` instead of `object`.

Version v0.13.1
---------------
Expand Down
5 changes: 5 additions & 0 deletions bluepysnap/circuit_ids.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,11 @@ def __init__(self, index, sort_index=True):
index = index.sortlevel()[0]
self.index = index

@property
def index_schema(self):
"""Return an empty index with the same names of the wrapped index."""
return pd.MultiIndex.from_tuples([], names=self.index.names)

@classmethod
def _instance(cls, index, sort_index=True):
"""The instance returned by the functions."""
Expand Down
33 changes: 22 additions & 11 deletions bluepysnap/network.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def _update(d, index, value):

res = {}
for pop in self.values():
for varname, dtype in pop.property_dtypes.iteritems():
for varname, dtype in pop.property_dtypes.items():
_update(res, varname, dtype)
return pd.Series(res)

Expand Down Expand Up @@ -146,7 +146,7 @@ def ids(self, group=None, sample=None, limit=None):

@abc.abstractmethod
def get(self, group=None, properties=None):
"""Returns the properties of a the NetworkObject."""
"""Returns the properties of the NetworkObject."""
ids = self.ids(group)
properties = utils.ensure_list(properties)
# We don t convert to set properties itself to keep the column order.
Expand All @@ -156,14 +156,25 @@ def get(self, group=None, properties=None):
if unknown_props:
raise BluepySnapError(f"Unknown properties required: {unknown_props}")

res = pd.DataFrame(index=ids.index, columns=properties)
for name, pop in self.items():
# Retrieve the dtypes of the selected properties.
# However, the int dtype may not be preserved if some values are NaN.
dtypes = {
column: dtype
for column, dtype in self.property_dtypes.items()
if column in properties_set
}
dataframes = [pd.DataFrame(columns=properties, index=ids.index_schema).astype(dtypes)]
for name, pop in sorted(self.items()):
# since ids is sorted, global_pop_ids should be sorted as well
global_pop_ids = ids.filter_population(name)
pop_ids = global_pop_ids.get_ids()
pop_properties = properties_set & pop.property_names
# indices from Population and get functions are different so I cannot
# use a dataframe equal directly and properties have different types so cannot use a
# multi dim numpy array
for prop in pop_properties:
res.loc[global_pop_ids.index, prop] = pop.get(pop_ids, prop).to_numpy()
return res.sort_index()
if len(pop_ids) > 0:
pop_properties = properties_set & pop.property_names
# Since the columns are passed as Series, index cannot be specified directly.
# However, it's a bit more performant than converting the Series to numpy arrays.
pop_df = pd.DataFrame({prop: pop.get(pop_ids, prop) for prop in pop_properties})
pop_df.index = global_pop_ids.index
dataframes.append(pop_df)
res = pd.concat(dataframes)
assert res.index.is_monotonic_increasing, "The index should be already sorted"
return res
3 changes: 2 additions & 1 deletion bluepysnap/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -696,5 +696,6 @@ def get(self, group=None, properties=None): # pylint: disable=arguments-differ
for the `properties` argument.
"""
if properties is None:
properties = self.property_names
# not strictly needed, but ensure that the properties are always in the same order
properties = sorted(self.property_names)
return super().get(group, properties)
4 changes: 3 additions & 1 deletion bluepysnap/spike_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ def get(self, group=None, t_start=None, t_stop=None):
data=[], index=pd.Index([], name="times"), name=series_name, dtype=IDS_DTYPE
)

# pylint: disable=unsubscriptable-object
res = pd.DataFrame(data=res, columns=[series_name, "times"]).set_index("times")[series_name]
if self._sorted_by != "by_time":
res.sort_index(inplace=True)
Expand Down Expand Up @@ -228,7 +229,8 @@ def log(self):
path = Path(self.config.output_dir) / self.config.log_file
if not path.exists():
raise BluepySnapError("Cannot find the log file for the spike report.")
yield path.open("r", encoding="utf-8")
with path.open("r", encoding="utf-8") as f:
yield f

@cached_property
def _spike_reader(self):
Expand Down
3 changes: 1 addition & 2 deletions tests/test_circuit.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

import bluepysnap.circuit as test_module
from bluepysnap.edges import EdgePopulation, Edges
from bluepysnap.exceptions import BluepySnapError
from bluepysnap.nodes import NodePopulation, Nodes

from utils import TEST_DATA_DIR, copy_test_data, edit_config
Expand All @@ -26,7 +25,7 @@ def test_all():
assert isinstance(circuit.nodes["default"], NodePopulation)
assert isinstance(circuit.nodes["default2"], NodePopulation)
assert sorted(circuit.node_sets) == sorted(
json.load(open(str(TEST_DATA_DIR / "node_sets.json")))
json.loads((TEST_DATA_DIR / "node_sets.json").read_text())
)


Expand Down
122 changes: 107 additions & 15 deletions tests/test_edges.py
Original file line number Diff line number Diff line change
Expand Up @@ -289,8 +289,8 @@ def test_ids(self):
assert tested == expected

def test_get(self):
with pytest.raises(BluepySnapError):
self.test_obj.get(properties=["other2", "unknown"])
with pytest.raises(BluepySnapError, match="You need to set edge_ids in get."):
self.test_obj.get(properties=["other2"])

ids = CircuitEdgeIds.from_dict({"default": [0, 1, 2, 3], "default2": [0, 1, 2, 3]})
with pytest.deprecated_call(
Expand All @@ -314,40 +314,132 @@ def test_get(self):

# tested columns
tested = self.test_obj.get(ids, properties=["other2", "other1", "@source_node"])
assert tested.shape == (self.test_obj.size, 3)
assert list(tested) == ["other2", "other1", "@source_node"]
expected = pd.DataFrame(
{
"other2": np.array([np.NaN, np.NaN, np.NaN, np.NaN, 10, 11, 12, 13], dtype=float),
"other1": np.array(
[np.NaN, np.NaN, np.NaN, np.NaN, "A", "B", "C", "D"], dtype=object
),
"@source_node": np.array([2, 0, 0, 2, 2, 0, 0, 2], dtype=int),
},
index=pd.MultiIndex.from_tuples(
[
("default", 0),
("default", 1),
("default", 2),
("default", 3),
("default2", 0),
("default2", 1),
("default2", 2),
("default2", 3),
],
names=["population", "edge_ids"],
),
)
pdt.assert_frame_equal(tested, expected)

tested = self.test_obj.get(
CircuitEdgeIds.from_dict({"default2": [0, 1, 2, 3]}),
properties=["other2", "other1", "@source_node"],
)
assert tested.shape == (4, 3)
# correct ordering when setting the dataframe with the population dataframe
assert tested.loc[("default2", 0)].tolist() == [10, "A", 2]
with pytest.raises(KeyError):
expected = pd.DataFrame(
{
"other2": np.array([10, 11, 12, 13], dtype=np.int32),
"other1": np.array(["A", "B", "C", "D"], dtype=object),
"@source_node": np.array([2, 0, 0, 2], dtype=int),
},
index=pd.MultiIndex.from_tuples(
[
("default2", 0),
("default2", 1),
("default2", 2),
("default2", 3),
],
names=["population", "edge_ids"],
),
)
pdt.assert_frame_equal(tested, expected)

with pytest.raises(KeyError, match="'default'"):
tested.loc[("default", 0)]

tested = self.test_obj.get(
CircuitEdgeIds.from_dict({"default": [0, 1, 2, 3]}),
properties=["other2", "other1", "@source_node"],
)
assert tested.shape == (4, 3)
assert tested.loc[("default", 0)].tolist() == [np.NaN, np.NaN, 2]
assert tested.loc[("default", 1)].tolist() == [np.NaN, np.NaN, 0]
expected = pd.DataFrame(
{
"other2": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
"other1": np.array([np.NaN, np.NaN, np.NaN, np.NaN], dtype=object),
"@source_node": np.array([2, 0, 0, 2], dtype=int),
},
index=pd.MultiIndex.from_tuples(
[
("default", 0),
("default", 1),
("default", 2),
("default", 3),
],
names=["population", "edge_ids"],
),
)
pdt.assert_frame_equal(tested, expected)

tested = self.test_obj.get(ids, properties="@source_node")
assert tested["@source_node"].tolist() == [2, 0, 0, 2, 2, 0, 0, 2]
expected = pd.DataFrame(
{
"@source_node": np.array([2, 0, 0, 2, 2, 0, 0, 2], dtype=int),
},
index=pd.MultiIndex.from_tuples(
[
("default", 0),
("default", 1),
("default", 2),
("default", 3),
("default2", 0),
("default2", 1),
("default2", 2),
("default2", 3),
],
names=["population", "edge_ids"],
),
)
pdt.assert_frame_equal(tested, expected)

tested = self.test_obj.get(ids, properties="other2")
assert tested["other2"].tolist() == [np.NaN, np.NaN, np.NaN, np.NaN, 10, 11, 12, 13]
expected = pd.DataFrame(
{
"other2": np.array([np.NaN, np.NaN, np.NaN, np.NaN, 10, 11, 12, 13], dtype=float),
},
index=pd.MultiIndex.from_tuples(
[
("default", 0),
("default", 1),
("default", 2),
("default", 3),
("default2", 0),
("default2", 1),
("default2", 2),
("default2", 3),
],
names=["population", "edge_ids"],
),
)
pdt.assert_frame_equal(tested, expected)

with pytest.raises(BluepySnapError):
with pytest.raises(BluepySnapError, match="Unknown properties required: {'unknown'}"):
self.test_obj.get(ids, properties=["other2", "unknown"])

with pytest.raises(BluepySnapError):
with pytest.raises(BluepySnapError, match="Unknown properties required: {'unknown'}"):
self.test_obj.get(ids, properties="unknown")

with pytest.deprecated_call():
with pytest.deprecated_call(
match=(
"Returning ids with get/properties is deprecated and will be removed in 1.0.0. "
"Please use Edges.ids instead."
)
):
self.test_obj.get(ids)

def test_properties_deprecated(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/test_node_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_get(self):
}

def test_iter(self):
expected = set(json.load(open(str(TEST_DATA_DIR / "node_sets_file.json"))))
expected = set(json.loads((TEST_DATA_DIR / "node_sets_file.json").read_text()))
assert set(self.test_obj) == expected


Expand Down

0 comments on commit 02da3ed

Please sign in to comment.