In [3]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Privacy Metrics

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Build a reproducible synthetic dataset of 100 records.
# The four columns are drawn, in this order, from the globally seeded NumPy
# generator; changing the order would change the generated values.
np.random.seed(0)
data = pd.DataFrame(
    dict(
        Age=np.random.randint(18, 70, size=100),            # integers in [18, 70)
        Income=np.random.randint(30000, 100000, size=100),  # integers in [30000, 100000)
        Health_Status=np.random.choice(['Good', 'Average', 'Poor'], size=100),
        Cardio_Disease=np.random.choice([0, 1], size=100),  # 0 for No, 1 for Yes
    )
)

# Preview the first ten rows
data.iloc[:10]

# 1. Anonymization Level (k-anonymity)

**Explanation**: k-Anonymity ensures that each record is indistinguishable from at least $k-1$ other records with respect to a chosen set of 'quasi-identifier' attributes.

**Formula**: A dataset is k-anonymous if each record shares its quasi-identifier values with at least $k-1$ other records.


In [None]:
# Anonymity threshold: each quasi-identifier combination is compared against k
k = 5

# Size of every group of records sharing the same (Age, Health_Status) values
k_anonymity_counts = data.groupby(['Age', 'Health_Status']).size()
# Keep only the groups that have fewer than k members
below_k = k_anonymity_counts.lt(k)
k_anonymity_violations = k_anonymity_counts.loc[below_k]

# Bar chart of the undersized groups
plt.rcParams.update({"figure.figsize": (12, 4)})
k_anonymity_violations.plot.bar(title='k-anonymity violations')

# 2. Information Loss

**Explanation**: Information loss measures the loss of data utility due to anonymization techniques.

**Formulas**: Information loss can be quantified in several ways depending on the context and the specific technique used for data anonymization. Common formulas include:

1. **General Information Loss**:
   - Formula: $IL = \frac{1}{N} \sum_{i=1}^{N} d(r_i, r'_i)$
   - Here, $N$ is the number of records, $r_i$ is the original record, $r'_i$ is the anonymized record, and $d$ is a distance function measuring the difference between the original and anonymized data.

2. **Normalized Certainty Penalty (NCP)**:
   - Formula: $NCP = \frac{\sum_{i=1}^{N} \sum_{j=1}^{m} \text{NCP}(A_j, r_i)}{N \times m}$
   - $N$ is the number of records, $m$ is the number of attributes, $A_j$ is the $j$th attribute, and $\text{NCP}(A_j, r_i)$ is the certainty penalty for attribute $A_j$ in record $r_i$.

3. **Average Equivalence Class Size**:
   - Formula: $AECS = \frac{1}{|D|} \sum_{e \in D} |e|$
   - $D$ represents the set of all equivalence classes, and $|e|$ is the size of an equivalence class.


In [None]:
# Example: generalizing 'Age' into age groups.
# Bins are left-closed ([18, 30), [30, 40), ... [60, 70)).
data['Age_Group'] = pd.cut(data['Age'], bins=[18, 30, 40, 50, 60, 70], right=False)

# Comparing distributions before and after generalization.
# The histogram has a numeric x-axis (years) while the bar chart has a
# categorical one (bar positions 0..4). Drawing both on the same axes would
# misalign them, so each distribution gets its own panel.
plt.rcParams["figure.figsize"] = (4, 3)  # default size for figures created afterwards
fig, (ax_original, ax_generalized) = plt.subplots(1, 2, figsize=(8, 3))

data['Age'].plot(kind='hist', alpha=0.5, ax=ax_original, title='Original')
ax_original.set_xlabel('Age')

age_group_counts = data['Age_Group'].value_counts().sort_index()
age_group_counts.plot(kind='bar', alpha=0.5, color='red', ax=ax_generalized, title='Generalized')
ax_generalized.set_xlabel('Age group')
ax_generalized.set_ylabel('Frequency')

fig.tight_layout()

# 3. Entropy

**Explanation**: Entropy is a measure of the unpredictability or randomness in a dataset.

**Formula**: $H(X) = -\sum p(x_i) \log_2 p(x_i)$


In [None]:
from scipy.stats import entropy

# Shannon entropy of 'Health_Status', in bits.
# scipy's entropy() uses the natural logarithm unless `base` is given;
# base=2 matches the formula H(X) = -sum p(x_i) * log2 p(x_i).
health_entropy = entropy(data['Health_Status'].value_counts(normalize=True), base=2)
health_entropy

# 4. Information Gain (Mutual Information)

**Explanation**: Measures the amount of information obtained about one random variable by observing another.

**Formula**: $G(X; Y) = H(X) - H(X|Y)$


In [None]:
from sklearn.metrics import mutual_info_score

# Mutual information between 'Health_Status' and 'Cardio_Disease', in bits.
# mutual_info_score reports its result in nats (natural logarithm); dividing by
# ln(2) converts it to bits, the unit of the log2-based entropy H used in
# G(X; Y) = H(X) - H(X|Y).
info_gain = mutual_info_score(data['Health_Status'], data['Cardio_Disease']) / np.log(2)
info_gain

# 5. ε-Differential Privacy

**Explanation**: Ensures that the addition or removal of a single database item does not significantly affect the outcome of any analysis.

**Formula**: The key formula is based on the concept of privacy budget (ε) and how it controls the addition of noise:

1. **Basic Definition of ε-Differential Privacy**:
   - Formula: $P(\mathcal{A}(D) \in S) \leq e^\varepsilon \times P(\mathcal{A}(D') \in S)$
   - $\mathcal{A}$ is the randomized algorithm, $D$ and $D'$ are datasets differing in at most one element, $S$ is any subset of outputs, and $\varepsilon$ (epsilon) is the privacy budget.

2. **Laplace Mechanism**:
   - Formula: $f_{\text{Laplace}}(x|\mu, b) = \frac{1}{2b} \exp\left(-\frac{|x - \mu|}{b}\right)$
   - Here, $\mu$ is the location parameter (typically the true value of a function), and $b$ is the scale parameter, often set as $\frac{\Delta f}{\varepsilon}$, where $\Delta f$ is the sensitivity of the function.

3. **Gaussian Mechanism**:
   - Formula: $f_{\text{Gaussian}}(x|\mu, \sigma) = \frac{1}{\sigma\sqrt{2\pi}} \exp\left(-\frac{(x - \mu)^2}{2\sigma^2}\right)$
   - $\mu$ is the mean (true value), and $\sigma$ is the standard deviation, related to $\varepsilon$ and $\delta$ (an additional parameter in differential privacy) through a more complex relationship involving sensitivity.

In [None]:
# Implementing a simple ε-differential privacy mechanism
def laplace_mechanism(value, epsilon=1.0, sensitivity=1.0):
    """Return ``value`` perturbed with Laplace noise of scale ``sensitivity / epsilon``.

    Parameters
    ----------
    value : number
        True query result to protect.
    epsilon : float, default 1.0
        Privacy budget; must be strictly positive. Smaller values produce
        larger noise.
    sensitivity : float, default 1.0
        Sensitivity of the query (the largest change one record can cause in
        ``value``); must be non-negative. With the default of 1.0 the noise
        scale is ``1 / epsilon``.

    Returns
    -------
    number
        ``value`` plus one sample drawn from ``np.random.laplace`` (global
        NumPy random state).

    Raises
    ------
    ValueError
        If ``epsilon`` is not greater than 0 or ``sensitivity`` is negative.
    """
    # `not epsilon > 0` also rejects NaN, which would otherwise yield NaN noise.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if sensitivity < 0:
        raise ValueError(f"sensitivity must be >= 0, got {sensitivity!r}")

    # Laplace mechanism: scale b = sensitivity / epsilon, centred on zero.
    noise = np.random.laplace(0, sensitivity / epsilon)
    return value + noise

# Privacy budgets to compare. The 0.0 entry is a sentinel for the exact,
# un-noised sum (a baseline for comparison); epsilon = 0 is not a usable
# budget for the Laplace mechanism because the scale 1/epsilon is undefined.
epsilon_arr = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
income_sum = []

# The true sum does not depend on epsilon, so compute it once.
true_income_sum = data['Income'].sum()

# NOTE: laplace_mechanism is called with its default noise scale (1/epsilon).
# A sum over incomes can change by far more than 1 when one record changes, so
# this illustrates the effect of epsilon rather than a calibrated DP release.
for e in epsilon_arr:
    if e == 0.0:
        income_sum.append(true_income_sum)
    else:
        # Applying differential privacy to the sum of 'Income'.
        # Named `noisy_sum` so the builtin sum() is not shadowed for later cells.
        noisy_sum = laplace_mechanism(true_income_sum, epsilon=e)
        income_sum.append(noisy_sum)

# Visualization
# Set the default figure size before creating the figure so it takes effect.
plt.rcParams["figure.figsize"] = (6, 3)
fig, ax = plt.subplots()
ax.set_xlabel("Epsilon")
ax.set_ylabel("Income Sum")
# Zoom the y-axis onto the observed range; the sums differ only slightly.
ax.set_ylim(min(income_sum) - 1, max(income_sum) + 1)
# Label the baseline bar explicitly instead of presenting it as epsilon = 0.0.
bar_labels = ["exact" if e == 0.0 else str(e) for e in epsilon_arr]
ax.bar(bar_labels, income_sum)

# 6. Privacy Loss

**Explanation**: Privacy loss quantifies the increase in risk of revealing private information due to data processing.

**Formula**: Often calculated as a comparison between the probabilities of data output with and without a specific input.


In [None]:
# Comparing the distribution of original and DP-protected data:
# every income value is perturbed independently with epsilon = 0.1.
original_income = data['Income']
dp_protected_income = original_income.apply(lambda x: laplace_mechanism(x, epsilon=0.1))

# Visualization: both histograms share one axes so they can be compared directly.
plt.rcParams["figure.figsize"] = (4, 3)
fig, ax = plt.subplots()
sns.histplot(original_income, alpha=0.5, label='Original', kde=True, ax=ax)
sns.histplot(dp_protected_income, alpha=0.5, label='DP Protected', color='green', kde=True, ax=ax)
# The labels passed above are only shown once a legend is drawn.
ax.legend()

# 7. Re-identification Risk

**Explanation**: Measures the risk of identifying individuals in anonymized data. Higher risk implies a greater chance of re-identification.

**Formula**: Often calculated based on the uniqueness of quasi-identifiers in the dataset.


In [None]:
# Estimating risk based on the uniqueness of quasi-identifiers
plt.rcParams["figure.figsize"] = (12, 3)

# Records sharing the same (Age, Health_Status) values form one group. An
# attacker who knows these values can pick the right record with probability
# 1 / group size, so a unique combination (size 1) carries the maximum risk of 1.
# groupby only yields combinations that occur, so every size is >= 1.
equivalence_class_sizes = data.groupby(['Age', 'Health_Status']).size()
risk_scores = 1 / equivalence_class_sizes
risk_scores.plot(kind='bar', title='Re-identification Risk Scores')

# 8. Data Distortion

**Explanation**: Quantifies the alteration of data due to anonymization techniques like generalization or noise addition.

**Formula**: Can be measured as the difference between original and distorted data distributions.


In [None]:
# Comparing original and generalized data distributions
original_data = data['Income']

# Generalizing income: round every value to the nearest thousand
generalized_data = original_data.apply(lambda x: round(x, -3))

# Visualization: overlay both histograms on one axes
plt.rcParams["figure.figsize"] = (4, 3)
fig, ax = plt.subplots()
sns.histplot(original_data, alpha=0.5, label='Original', ax=ax)
sns.histplot(generalized_data, alpha=0.5, label='Generalized', color='green', ax=ax)
# The labels passed above are only shown once a legend is drawn.
ax.legend()

# 9. Generalization Error

**Explanation**: Measures the loss of information due to generalizing data values into broader categories.

**Formula**: Difference between detailed and generalized data values.



The generalization of age values into broader categories serves as a privacy-preserving measure by reducing the specificity of data points. While this approach helps protect identities by making specific records less identifiable, it also introduces information loss. The generalization error quantifies this loss by comparing the original data distribution with the generalized distribution. In essence, it measures the extent to which data usefulness is compromised for the sake of privacy.

Visualizing the original and generalized distributions side by side allows us to observe the impact of generalization on data granularity. A detailed distribution reveals individual variations, whereas a generalized distribution packs these details into categories. This comparison highlights the balance that must be achieved between privacy protection and the preservation of data utility.


In [None]:
# Bucket ages into labelled, left-closed groups
data['Age_Group'] = pd.cut(
    data['Age'],
    bins=[0, 30, 40, 50, 60, 70, np.inf],
    labels=['<30', '30-39', '40-49', '50-59', '60-69', '70+'],
    right=False,
)

# One wide figure with two panels next to each other
fig, (ax_age, ax_group) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: histogram of the raw ages
data['Age'].plot.hist(bins=20, alpha=0.7, label='Original Age', ax=ax_age)
ax_age.set_xlabel('Age (Years)')
ax_age.set_ylabel('Frequency')
ax_age.set_title('Original Age Distribution')

# Right panel: number of records per age group, in group order
group_counts = data['Age_Group'].value_counts().sort_index()
group_counts.plot.bar(alpha=0.7, color='orange', label='Age Group', ax=ax_group)
ax_group.set_xlabel('Age Group')
ax_group.set_ylabel('Frequency')
ax_group.set_title('Generalized Age Group Distribution')

fig.tight_layout()
plt.show()


# Comparison plot

original_age = data['Age']
generalized_age = data['Age_Group'].cat.codes  # Convert categories to numerical codes for visualization

# Create a representative age for each age group for comparison.
# Each bounded group is represented by its upper bound; the open-ended '70+'
# group uses the largest observed age when that exceeds 70, otherwise 100.
max_age = data["Age"].max()
age_group_mapping = {'<30': 30, '30-39': 40, '40-49': 50, '50-59': 60, '60-69': 70, '70+': (max_age if max_age > 70 else 100)}
data['Generalized_Age_Representative'] = data['Age_Group'].map(age_group_mapping)

# Create exactly one figure for the overlay; a second plt.figure() call before
# plotting would leave an extra, empty figure in the output.
plt.figure(figsize=(16, 8))

# Histogram for Original Age
plt.hist(data['Age'], bins=20, alpha=0.7, label='Original Age')

# Overlay with representative points for each age group
colors = ['red', 'blue', 'green', 'purple', 'orange', 'brown']

# One dashed vertical line per group, placed at the group's representative age
for (group, representative_age), color in zip(age_group_mapping.items(), colors):
    plt.axvline(x=representative_age, color=color, linestyle='--', lw=2,
                label=f"{group} (Gen. Age ~{representative_age} years)")

plt.xlabel('Age (Years) / Represented Age Groups')
plt.ylabel('Frequency')
plt.title('Overlayed Representation of Original and Generalized Age Distributions')
plt.legend()

plt.show()

# 10. Disclosure Risk

**Explanation**: Assesses the likelihood of disclosing sensitive information in anonymized data.

**Formula**: Calculated based on the probability of correctly inferring sensitive attributes.


In [None]:
# Estimating risk based on attribute inference:
# relative frequency of each 'Cardio_Disease' value
disclosure_risk = data['Cardio_Disease'].value_counts(normalize=True)

# Visualization
plt.rcParams.update({"figure.figsize": (4, 3)})
disclosure_risk.plot.bar(title='Disclosure Risk Estimation')

# 11. Utility Loss

**Explanation**: Reflects the reduction in data utility due to privacy-preserving techniques.

**Formula**: Contrast between utility of original and privacy-preserved data.


In [None]:
# Utility loss in data generalization:
# signed relative change of the mean between original and generalized values
original_mean = original_data.mean()
generalized_mean = generalized_data.mean()
utility_loss = (original_mean - generalized_mean) / original_mean
utility_loss

# 12. Query Answering Accuracy

**Explanation**: Measures the accuracy of responses to queries on privacy-preserved data.

**Formula**: Accuracy = (Correct answers) / (Total queries)


In [None]:
# Accuracy of queries on DP-protected data:
# perturb every income with epsilon = 0.1, then measure the share of records
# whose rounded noisy value still equals the rounded original value
income = data['Income']
dp_protected_data = income.apply(lambda amount: laplace_mechanism(amount, epsilon=0.1))
rounded_matches = dp_protected_data.round() == original_data.round()
accuracy = np.mean(rounded_matches)
accuracy