In [1]:
#DeepSeek
import pandas as pd
import numpy as np
from diffprivlib.mechanisms import Laplace

# Differentially private histogram of birth decades.
#
# Pipeline: load the CSV, remove duplicate rows and rows missing any identity /
# date-of-birth field, bucket DobYear into decades, count rows per decade, then
# release each count through the Laplace mechanism and print true vs. noisy
# counts side by side.

# --- Configuration -----------------------------------------------------------
# NOTE(review): machine-specific absolute path; point it at your own copy.
file_path = "/Users/felipecastanogonzalez/Downloads/ChfSynthData-13_09_2024.csv"
required_columns = ["FirstName", "LastName", "DobDay", "DobMonth", "DobYear"]

epsilon = 0.1
# Each row lands in exactly one decade bin, so adding or removing a single row
# changes exactly one count by 1.
sensitivity = 1

# Fixed, data-independent decade bins. Deriving the bins from the data itself
# (e.g. from value_counts) would drop empty decades from the release, and the
# mere presence/absence of a row would then reveal whether a decade had any
# records -- something the added noise cannot hide. The range covers the
# decades 1900-2010 that appear in this notebook's recorded output.
decade_bins = list(range(1900, 2020, 10))

# --- Steps 1-3: load, clean, keep only the required columns ------------------
# A single chain (instead of repeated in-place reassignment and column writes)
# keeps the cell idempotent and avoids SettingWithCopyWarning.
df = (
    pd.read_csv(file_path)
    .drop_duplicates()                 # duplicates judged on ALL original columns
    .dropna(subset=required_columns)   # DobYear must be present before int cast
    .loc[:, required_columns]
    .astype({"DobYear": int})
)

# --- Step 4: decade bucket ---------------------------------------------------
# Years outside the public range are clamped into the edge bins rather than
# creating new, data-revealing bins.
df = df.assign(
    Decade=((df["DobYear"] // 10) * 10).clip(
        lower=decade_bins[0], upper=decade_bins[-1]
    )
)

# --- Step 5: true count per decade over the fixed bins -----------------------
# reindex() both orders the rows by decade and inserts explicit zero counts for
# decades with no records.
true_counts = (
    df["Decade"]
    .value_counts()
    .reindex(decade_bins, fill_value=0)
    .rename_axis("Decade")
    .reset_index(name="TrueCount")
)

# --- Step 6: Laplace mechanism -----------------------------------------------
laplace = Laplace(epsilon=epsilon, sensitivity=sensitivity)

# Rounding and flooring at zero are applied after the noise has been added, to
# the already-noised value only.
private_counts = [
    max(0, round(laplace.randomise(count)))  # Ensure non-negative counts
    for count in true_counts["TrueCount"]
]

# --- Step 7: comparison table ------------------------------------------------
result_df = true_counts.assign(PrivateCount=private_counts)

# --- Step 8: print the comparison table --------------------------------------
print("\nComparison of True vs Differentially Private Counts by Decade:")
print(result_df.to_string(index=False))


Comparison of True vs Differentially Private Counts by Decade:
 Decade  TrueCount  PrivateCount
   1900       1650          1660
   1910       1691          1676
   1920       1500          1494
   1930       1572          1572
   1940       1692          1672
   1950       1625          1621
   1960       1433          1433
   1970       1565          1579
   1980       1626          1608
   1990       1633          1628
   2000          2             6
   2010          2            34
