### Code by Denis Loechel as part of a master's thesis on synthetic data generation

In [24]:
#Importing packages
import pandas as pd
from sdmetrics.single_table import NumericalLR
from sdmetrics.single_table import LogisticDetection
import json
import warnings

In [25]:
# Suppress every warning for the rest of the session so that the metric
# cells below only display their scores
warnings.filterwarnings(action="ignore")

# CTGAN

## 500 epochs

### Privacy Inference

In [28]:
# Load the real HR dataset and the CTGAN (500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs500.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.7398083260200553

### Logistic Detection

In [29]:
# Load the real HR dataset and the CTGAN (500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs500.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.14657183221309433

## 1000 epochs

### Privacy Inference

In [27]:
# Load the real HR dataset and the CTGAN (1000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs1000.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.6904482803555052

### Logistic Detection

In [30]:
# Load the real HR dataset and the CTGAN (1000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs1000.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.21242895195419464

## 1500 epochs

### Privacy Inference

In [123]:
# Load the real HR dataset and the CTGAN (1500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs1500.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.8520271510772343

### Logistic Detection

In [31]:
# Load the real HR dataset and the CTGAN (1500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs1500.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.21177241722678597

## 2000 epochs

### Privacy Inference

In [44]:
# Load the real HR dataset and the CTGAN (2000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs2000.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.7619436709693943

### Logistic Detection

In [45]:
# Load the real HR dataset and the CTGAN (2000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CTGAN_epochs2000.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.1631172392332586

# CopulaGAN

## 500 epochs

### Privacy Inference

In [36]:
# Load the real HR dataset and the CopulaGAN (500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs500.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.8228049495006052

### Logistic Detection

In [37]:
# Load the real HR dataset and the CopulaGAN (500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs500.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.22911226039332833

## 1000 epochs

### Privacy Inference

In [38]:
# Load the real HR dataset and the CopulaGAN (1000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs1000.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.7758843983412084

### Logistic Detection

In [39]:
# Load the real HR dataset and the CopulaGAN (1000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs1000.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.10700664675130689

## 1500 epochs

### Privacy Inference

In [40]:
# Load the real HR dataset and the CopulaGAN (1500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs1500.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.820579482360653

### Logistic Detection

In [41]:
# Load the real HR dataset and the CopulaGAN (1500 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs1500.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.07617271595718178

## 2000 epochs

### Privacy Inference

In [42]:
# Load the real HR dataset and the CopulaGAN (2000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs2000.csv")

# Privacy-against-inference check with the NumericalLR metric.
# The call is the last expression of the cell, so its score is displayed.
NumericalLR.compute(
    synthetic_data=synthetic_table,
    real_data=real_table,
    # Columns the attacker is assumed to have access to
    key_fields=[
        "GenderID",
        "MarriedID",
        "YearofHire",
        "Absences",
        "DaysLateLast30",
    ],
    # Columns we would like to protect from an inference attack
    sensitive_fields=[
        "CitizenDesc",
        "Termd",
        "EmpID",
        "Salary",
        "DeptID",
        "ManagerID",
        "PositionID",
    ],
)

0.7723378836127905

### Logistic Detection

In [43]:
# Load the real HR dataset and the CopulaGAN (2000 epochs) synthetic dataset
real_table = pd.read_csv("20230308_Updated_HR_dataset.csv")
synthetic_table = pd.read_csv("synthetic_dataset_CopulaGANepochs2000.csv")

# Read the table metadata inside a context manager so the file handle is
# closed immediately instead of being left open by a bare open() call
with open("metadata.json", "r") as metadata_file:
    metadata = json.load(metadata_file)

# Running the logistic detection metric to calculate how difficult it is
# to tell apart the real data from the synthetic data.
# The call is the last expression of the cell, so its score is displayed.
LogisticDetection.compute(
    real_data=real_table,
    synthetic_data=synthetic_table,
    metadata=metadata,
)

0.4099094224545682

# Documentation References 

- Privacy Inference: https://docs.sdv.dev/sdmetrics/metrics/metrics-in-beta/privacy-against-inference
- Privacy and Security metrics: https://docs.sdv.dev/sdmetrics/metrics/metrics-glossary
- Logistic Detection: https://docs.sdv.dev/sdmetrics/metrics/metrics-in-beta/detection-single-table
- NLR: https://docs.sdv.dev/sdmetrics/metrics/metrics-in-beta/privacy-against-inference#numerical-data