add the support of client-level DP

fjxmlzn · fjxmlzn · commit 25cdf854a22f · 2025-02-20T01:01:34.000-08:00
diff --git a/pe/constant/data.py b/pe/constant/data.py
@@ -1,5 +1,7 @@
 #: The column name of the label ID
 LABEL_ID_COLUMN_NAME = "PE.LABEL_ID"
+#: The column name of the client ID (if using client-level DP)
+CLIENT_ID_COLUMN_NAME = "PE.CLIENT_ID"
 
 #: The column name of the clean histogram
 CLEAN_HISTOGRAM_COLUMN_NAME = "PE.CLEAN_HISTOGRAM"
diff --git a/pe/data/data.py b/pe/data/data.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import numpy as np
 from pe.constant.data import LABEL_ID_COLUMN_NAME
+from pe.constant.data import CLIENT_ID_COLUMN_NAME
 
 
 class Data:
@@ -183,3 +184,34 @@ def concat(cls, data_list, metadata=None):
                 raise ValueError("Metadata must be the same")
             metadata = metadata_list[0]
         return Data(data_frame=pd.concat(data_frame_list), metadata=metadata)
+
+    def split_by_client(self):
+        """Split the data frame by client ID
+
+        :raises ValueError: If the client ID column is not in the data frame
+        :return: The list of data objects with the split data, one per client ID
+        :rtype: list[:py:class:`pe.data.Data`]
+        """
+        if CLIENT_ID_COLUMN_NAME not in self.data_frame.columns:
+            raise ValueError(f"{CLIENT_ID_COLUMN_NAME} not in data frame")
+        grouped_data_frame = self.data_frame.groupby(CLIENT_ID_COLUMN_NAME)
+        return [Data(data_frame=data_frame, metadata=self.metadata) for _, data_frame in grouped_data_frame]
+
+    def split_by_index(self):
+        """Split the data frame by index
+
+        :return: The list of data objects with the split data, one per distinct index value
+        :rtype: list[:py:class:`pe.data.Data`]
+        """
+        grouped_data_frame = self.data_frame.groupby(self.data_frame.index)
+        return [Data(data_frame=data_frame, metadata=self.metadata) for _, data_frame in grouped_data_frame]
+
+    def reset_index(self, **kwargs):
+        """Reset the index of the data frame
+
+        :param kwargs: The keyword arguments to pass to the pandas reset_index function
+        :type kwargs: dict
+        :return: A new :py:class:`pe.data.Data` object with the reset index data frame
+        :rtype: :py:class:`pe.data.Data`
+        """
+        return Data(data_frame=self.data_frame.reset_index(**kwargs), metadata=self.metadata)
diff --git a/pe/histogram/nearest_neighbors.py b/pe/histogram/nearest_neighbors.py
@@ -26,6 +26,7 @@ def __init__(
         api=None,
         num_nearest_neighbors=1,
         backend="auto",
+        vote_normalization_level="sample",
     ):
         """Constructor.
 
@@ -58,6 +59,10 @@ def __init__(
             private samples is large. It requires the installation of `faiss-gpu` or `faiss-cpu` package. See
             https://faiss.ai/
         :type backend: str, optional
+        :param vote_normalization_level: The level of normalization for the votes. It should be one of the following:
+            "sample" (normalize the votes from each private sample to have l2 norm = 1) or "client" (normalize the
+            votes from all private samples of the same client to have l2 norm = 1). Defaults to "sample"
+        :type vote_normalization_level: str, optional
         :raises ValueError: If the `api` is not provided when `lookahead_degree` is greater than 0
         :raises ValueError: If the `backend` is unknown
         """
@@ -86,6 +91,8 @@ def __init__(
         else:
             raise ValueError(f"Unknown backend: {backend}")
 
+        self._vote_normalization_level = vote_normalization_level
+
     def _log_lookahead(self, syn_data, lookahead_id):
         """Log the lookahead data.
 
@@ -163,6 +170,7 @@ def compute_histogram(self, priv_data, syn_data):
         :type priv_data: :py:class:`pe.data.Data`
         :param syn_data: The synthetic data
         :type syn_data: :py:class:`pe.data.Data`
+        :raises ValueError: If the `vote_normalization_level` is unknown
         :return: The private data, possibly with the additional embedding column, and the synthetic data, with the
             computed histogram in the column :py:const:`pe.constant.data.CLEAN_HISTOGRAM_COLUMN_NAME` and possibly with
             the additional embedding column
@@ -189,10 +197,22 @@ def compute_histogram(self, priv_data, syn_data):
         )
         self._log_voting_details(priv_data=priv_data, syn_data=syn_data, ids=ids)
 
-        counter = Counter(list(ids.flatten()))
+        priv_data = priv_data.reset_index(drop=True)
+        if self._vote_normalization_level == "client":
+            priv_data_list = priv_data.split_by_client()
+        elif self._vote_normalization_level == "sample":
+            priv_data_list = priv_data.split_by_index()
+        else:
+            raise ValueError(f"Unknown vote normalization level: {self._vote_normalization_level}")
+
         count = np.zeros(shape=syn_embedding.shape[0], dtype=np.float32)
-        count[list(counter.keys())] = list(counter.values())
-        count /= np.sqrt(self._num_nearest_neighbors)
+        for sub_priv_data in priv_data_list:
+            sub_count = np.zeros(shape=syn_embedding.shape[0], dtype=np.float32)
+            sub_ids = ids[sub_priv_data.data_frame.index]
+            counter = Counter(list(sub_ids.flatten()))
+            sub_count[list(counter.keys())] = list(counter.values())
+            sub_count /= np.linalg.norm(sub_count)
+            count += sub_count
 
         syn_data.data_frame[CLEAN_HISTOGRAM_COLUMN_NAME] = count