Use RangeIndex in nodes dataframe and improve _check_id (#207)

With a regular index, the first call to df.loc[x] where x is an array (rather than an integer) makes pandas evaluate the cached property Index.is_unique, which can take a few seconds on big dataframes. Using a RangeIndex solves the issue, because is_unique is always True for a RangeIndex.
BlueBrain · May 9, 2023 · c2d11e4 · c2d11e4
1 parent 95e6006
commit c2d11e4
Showing 1 changed file with 4 additions and 4 deletions.
diff --git a/bluepysnap/nodes/node_population.py b/bluepysnap/nodes/node_population.py
@@ -100,7 +100,7 @@ def _data(self):
         categoricals = nodes.enumeration_names
 
         _all = nodes.select_all()
-        result = pd.DataFrame(index=np.arange(_all.flat_size))
+        result = pd.DataFrame(index=pd.RangeIndex(_all.flat_size))
 
         for attr in sorted(nodes.attribute_names):
             if attr in categoricals:
@@ -240,7 +240,7 @@ def property_dtypes(self):
 
     def _check_id(self, node_id):
         """Check that single node ID belongs to the circuit."""
-        if node_id not in self._data.index:
+        if node_id < 0 or node_id >= len(self._data.index):
             raise BluepySnapError(f"node ID not found: {node_id} in population '{self.name}'")
 
     def _check_ids(self, node_ids):
@@ -254,9 +254,9 @@ def _check_ids(self, node_ids):
         else:
             max_id = max(node_ids)
             min_id = min(node_ids)
-        if min_id < 0 or max_id >= self._data.index.shape[0]:
+        if min_id < 0 or max_id >= len(self._data.index):
             raise BluepySnapError(
-                f"All node IDs must be >= 0 and < {self._data.index.shape[0]} "
+                f"All node IDs must be >= 0 and < {len(self._data.index)} "
                 f"for population '{self.name}'"
             )