In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the target table (`tgt`) from its CSV file.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
tgt_csv_path = "C:/Users/admin/Documents/Diabetes_Research/1.Paper/NIDDK-DF_2024/Data_quality/NIDDK-DF-new-3targets.csv"
tgt = pd.read_csv(tgt_csv_path)

In [None]:
# Restrict `tgt` to the 16 columns used in this notebook, in this order.
tgt_columns = [
    'Patient_id',
    'cholesterol', 'glucose', 'hdl_chol', 'chol_hdl_ratio',
    'age', 'gender', 'height', 'weight', 'bmi',
    'systolic_bp', 'diastolic_bp',
    'waist', 'hip', 'waist_hip_ratio',
    'Outcome',
]
tgt = tgt[tgt_columns]

In [None]:
# Read the SD-LLM table (`syn_sdllm`) from its CSV file.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
sdllm_csv_path = "C:/Users/admin/Documents/Diabetes_Research/1.Paper/NIDDK-DF_2024/NIDDK-DF_SD-LLM_2024_all.csv"
syn_sdllm = pd.read_csv(sdllm_csv_path)

In [None]:
# Report the size of each loaded table. `tgt` is the original data, so its
# line must not be labelled "synthetic" like the `syn_sdllm` line below it.
print(f"fetched original data with {tgt.shape[0]:,} records and {tgt.shape[1]} attributes")
print(f"fetched synthetic data with {syn_sdllm.shape[0]:,} records and {syn_sdllm.shape[1]} attributes")

In [None]:
# Count the rows per `Outcome` value in the target table.
tgt['Outcome'].value_counts()

In [None]:
# Count the rows per `Outcome` value in `syn`.
# NOTE(review): no cell shown above assigns `syn`; unless it is defined outside
# the cells visible here, this raises NameError on a fresh kernel. Presumably the
# cell that loaded the DTA synthetic CSV was removed — confirm and restore it.
syn['Outcome'].value_counts()

In [None]:
# Count the rows per `Outcome` value in the SD-LLM table.
syn_sdllm['Outcome'].value_counts()

In [None]:
# Show column dtypes and non-null counts of `tgt` before the dtype casts in the next cell.
tgt.info()

In [None]:
# Cast the two ratio columns and BMI to float and make the label categorical.
tgt_dtypes = {
    'chol_hdl_ratio': float,
    'bmi': float,
    'waist_hip_ratio': float,
    'Outcome': 'category',
}
tgt = tgt.astype(tgt_dtypes)

In [None]:
# Show column dtypes and non-null counts of `syn`.
# NOTE(review): `syn` is not assigned by any cell shown above — confirm where it is loaded.
syn.info()

In [None]:
# Show column dtypes and non-null counts of `syn_sdllm` before the dtype casts in the next cell.
syn_sdllm.info()

In [None]:
# Cast the two ratio columns and BMI to float; make gender and the label categorical.
sdllm_dtypes = {
    'chol_hdl_ratio': float,
    'bmi': float,
    'waist_hip_ratio': float,
    'gender': 'category',
    'Outcome': 'category',
}
syn_sdllm = syn_sdllm.astype(sdllm_dtypes)

In [None]:
# Restrict `syn` to the same 16 columns, in the same order, as selected for `tgt`.
syn_columns = [
    'Patient_id',
    'cholesterol', 'glucose', 'hdl_chol', 'chol_hdl_ratio',
    'age', 'gender', 'height', 'weight', 'bmi',
    'systolic_bp', 'diastolic_bp',
    'waist', 'hip', 'waist_hip_ratio',
    'Outcome',
]
syn = syn[syn_columns]

In [None]:
# Restrict `syn_sdllm` to the same 16 columns, in the same order, as selected for `tgt`.
sdllm_columns = [
    'Patient_id',
    'cholesterol', 'glucose', 'hdl_chol', 'chol_hdl_ratio',
    'age', 'gender', 'height', 'weight', 'bmi',
    'systolic_bp', 'diastolic_bp',
    'waist', 'hip', 'waist_hip_ratio',
    'Outcome',
]
syn_sdllm = syn_sdllm[sdllm_columns]

In [None]:
# Import label encoder
from sklearn import preprocessing

label_encoder = preprocessing.LabelEncoder()


def _holds_text(column):
    """Return True when `column` stores non-numeric values (plain or categorical)."""
    dtype = column.dtype
    if isinstance(dtype, pd.CategoricalDtype):
        # For a categorical column the dtype of interest is that of its categories.
        dtype = dtype.categories.dtype
    return not pd.api.types.is_numeric_dtype(dtype)


# Encode 'gender' with ONE encoder fitted on every table that still holds text
# labels. Fitting a separate encoder per table can give the same label different
# integer codes when the tables do not contain the same set of values, and it
# previously left `syn_sdllm` un-encoded although it is compared against the
# encoded `tgt` later on.
# Tables whose 'gender' is already numeric are skipped, which also makes this
# cell safe to re-run.
gender_frames = {'tgt': tgt, 'syn': syn, 'syn_sdllm': syn_sdllm}
text_frames = [frame for frame in gender_frames.values() if _holds_text(frame['gender'])]
if text_frames:
    label_encoder.fit(
        pd.concat([frame['gender'].astype(object) for frame in text_frames], ignore_index=True)
    )
    for frame in text_frames:
        frame['gender'] = label_encoder.transform(frame['gender'].astype(object))

# Show the codes present in each table after encoding.
{name: frame['gender'].unique() for name, frame in gender_frames.items()}

#### For more details
https://mostly.ai/blog/synthetic-data-quality-assurance

In [None]:
# Check whether the synthetic glucose values stay within the min/max bounds
# set by the real glucose values.
from sdmetrics.single_column import BoundaryAdherence

real_glucose = tgt['glucose']
synthetic_glucose = syn['glucose']
BoundaryAdherence.compute(real_glucose, synthetic_glucose)

In [None]:
# Check whether the SD-LLM glucose values stay within the min/max bounds
# set by the real glucose values.
from sdmetrics.single_column import BoundaryAdherence

real_glucose = tgt['glucose']
sdllm_glucose = syn_sdllm['glucose']
BoundaryAdherence.compute(real_glucose, sdllm_glucose)

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [None]:
def quality_metrics(tgt, syn, random_state=None):
    """Compute distance-to-closest-record (DCR) arrays for holdout and synthetic rows.

    `tgt` is shuffled and split into two equally sized halves (training and
    holdout); `syn` is down-sampled to the same size. For every holdout row and
    every synthetic row the two nearest training rows are looked up with the L1
    metric, and the resulting distances are squared.

    Every column of the inputs takes part in the distance: columns that are
    numeric in the training half are mean-imputed, all other columns are
    one-hot encoded. Drop identifier columns before calling if they should not
    influence the result.

    Parameters
    ----------
    tgt : pandas.DataFrame
        Original data; at least four rows are needed.
    syn : pandas.DataFrame
        Synthetic data containing the columns of `tgt`; at least two rows are needed.
    random_state : int, numpy.random.Generator or None, default None
        Seed forwarded to both ``DataFrame.sample`` calls. ``None`` keeps the
        earlier behaviour of drawing a different split on every call.

    Returns
    -------
    dcrs_hol, dcrs_syn : numpy.ndarray of shape (n, 2)
        Squared distances from each holdout / synthetic row to its nearest and
        second-nearest training row.
    ndcr_hol, ndcr_syn : numpy.ndarray of shape (n,)
        Nearest squared distance divided by the 95th percentile of the holdout
        nearest squared distances (floored at 1e-8).

    Raises
    ------
    ValueError
        If the inputs are too small to provide two training rows.
    """
    no_of_records = min(tgt.shape[0] // 2, syn.shape[0], 10_000)
    if no_of_records < 2:
        # The 2-nearest-neighbour search below needs at least two training rows.
        raise ValueError(
            "quality_metrics needs at least 4 rows in tgt and 2 rows in syn; "
            f"got {tgt.shape[0]} and {syn.shape[0]}"
        )

    # Shuffle once, then cut the shuffled frame into disjoint training / holdout halves.
    shuffled = tgt.sample(n=2 * no_of_records, random_state=random_state)
    trn = shuffled.head(no_of_records)
    hol = shuffled.tail(no_of_records)
    syn = syn.sample(n=no_of_records, random_state=random_state)

    string_cols = trn.select_dtypes(exclude=np.number).columns
    numeric_cols = trn.select_dtypes(include=np.number).columns
    transformer = make_column_transformer(
        (SimpleImputer(missing_values=np.nan, strategy="mean"), numeric_cols),
        (OneHotEncoder(), string_cols),
        remainder="passthrough",
    )
    # Fit on all three parts so the one-hot encoder knows every category it will see.
    transformer.fit(pd.concat([trn, hol, syn], axis=0))
    trn_hot = transformer.transform(trn)
    hol_hot = transformer.transform(hol)
    syn_hot = transformer.transform(syn)

    # calculate distances to nearest neighbors
    index = NearestNeighbors(n_neighbors=2, algorithm="brute", metric="l1", n_jobs=-1)
    index.fit(trn_hot)
    # k-nearest-neighbor search for both holdout and synthetic data, k=2 to calculate DCR + NNDR
    dcrs_hol, _ = index.kneighbors(hol_hot)
    dcrs_syn, _ = index.kneighbors(syn_hot)
    dcrs_hol = np.square(dcrs_hol)
    dcrs_syn = np.square(dcrs_syn)

    # Normalise by the holdout's 95th percentile; the floor avoids division by zero.
    dcr_bound = np.maximum(np.quantile(dcrs_hol[:, 0], 0.95), 1e-8)
    ndcr_hol = dcrs_hol[:, 0] / dcr_bound
    ndcr_syn = dcrs_syn[:, 0] / dcr_bound

    return dcrs_hol, dcrs_syn, ndcr_hol, ndcr_syn

### For DTA

In [None]:
# Distance metrics for `syn` against `tgt`.
# NOTE(review): both frames still carry `Patient_id`, and quality_metrics uses every
# column, so the identifier contributes to the distances — confirm this is intended.
dcrs_hol, dcrs_syn, ndcr_hol, ndcr_syn = quality_metrics(tgt, syn)

In [None]:
print(f"Normalized DCR 5-th percentile original  {np.percentile(ndcr_hol, 5):.3f}")
print(f"Normalized DCR 5-th percentile synthetic {np.percentile(ndcr_syn, 5):.3f}")

# NNDR: nearest (squared) distance divided by the second-nearest one.
# The denominator is floored so that a zero second-neighbour distance (e.g.
# duplicated training rows) cannot produce NaN/inf and poison the percentile.
nndr_hol = dcrs_hol[:, 0] / np.maximum(dcrs_hol[:, 1], 1e-8)
nndr_syn = dcrs_syn[:, 0] / np.maximum(dcrs_syn[:, 1], 1e-8)
print(f"NNDR 5-th percentile original  {np.percentile(nndr_hol, 5):.3f}")
print(f"NNDR 5-th percentile synthetic {np.percentile(nndr_syn, 5):.3f}")

### For SD-LLM

In [None]:
# Distance metrics for `syn_sdllm` against `tgt`.
# NOTE(review): both frames still carry `Patient_id`, and quality_metrics uses every
# column, so the identifier contributes to the distances — confirm this is intended.
dcrs_hol1, dcrs_syn1, ndcr_hol1, ndcr_syn1 = quality_metrics(tgt, syn_sdllm)

In [None]:
# NOTE(review): the "_prediabetic" suffix in the two labels below does not match any
# filtering visible in this notebook — confirm whether it is a copy-paste leftover.
print(f"Normalized DCR 5-th percentile original_prediabetic  {np.percentile(ndcr_hol1, 5):.3f}")
print(f"Normalized DCR 5-th percentile synthetic_prediabetic {np.percentile(ndcr_syn1, 5):.3f}")

# NNDR: nearest (squared) distance divided by the second-nearest one.
# The denominator is floored so that a zero second-neighbour distance (e.g.
# duplicated training rows) cannot produce NaN/inf and poison the percentile.
nndr_hol1 = dcrs_hol1[:, 0] / np.maximum(dcrs_hol1[:, 1], 1e-8)
nndr_syn1 = dcrs_syn1[:, 0] / np.maximum(dcrs_syn1[:, 1], 1e-8)
print(f"NNDR 5-th percentile original  {np.percentile(nndr_hol1, 5):.3f}")
print(f"NNDR 5-th percentile synthetic {np.percentile(nndr_syn1, 5):.3f}")

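For both privacy metrics (normalized DCR and NNDR), the value for the synthetic dataset should be similar to, but not smaller than, the value for the original holdout data. A synthetic value that is clearly smaller means synthetic records lie closer to the training records than unseen real records do, which suggests memorization. Values that are similar or larger give us confidence that the synthetic data has not learned privacy-revealing information from the training data.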