In [151]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality


from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.ModelInspector import ModelInspector
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

import numpy as np
import pandas as pd
from scipy import stats
import json
import os

In [152]:
file_name = 'Data_num.csv'
folder_path = "E:/Olha/SSCI disability/experiment/" # create folder "experiment" in the same directory as the script

real_data = pd.read_csv(file_name)

In [None]:
disability_mapping = {
    'Pain': 0,
    'Flexibility': 1,
    'Mobility': 2,
    'MentalHealth': 3,
    'Seeing': 4,
    'Hearing': 5,
    'Dexterity': 6,
    'Learning': 7,
    'Memory': 8,
    'Developmental': 9,
    'Unknown': 10
}

age_mapping = {
    '15_to_24': 0,
    '25_to_64': 1,
    '65_and_over': 2
}

gender_mapping = {
    'Men': 0,
    'Women': 1
}

barriers_mapping = {
    'Interaction_family': 0,
    'Interaction_healthcare': 1,
    'Interaction_services': 2
}

frequency_mapping = {
    'Never': 0,
    'Sometimes': 1,
    'Often': 2,
    'Always': 3
}

real_data['Disability'] = real_data['Disability'].map(disability_mapping)
real_data['Age'] = real_data['Age'].map(age_mapping)
real_data['Gender'] = real_data['Gender'].map(gender_mapping)
real_data['Barriers'] = real_data['Barriers'].map(barriers_mapping)
real_data['Frequency'] = real_data['Frequency'].map(frequency_mapping)

# Synthetic

In [154]:
real_data.to_csv('Data_num_encoded.csv', index=False)
real_data = pd.read_csv('Data_num_encoded.csv')
input_data = 'Data_num_encoded.csv'
input = pd.read_csv(input_data)
input.head()

categorical_attributes = {'Disability' : True, 'Age': True, 'Gender': True,'Barriers': True,'Frequency': True}
candidate_keys = {'ssn': True}

In [155]:
metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath=file_name)
metadata.validate()
# metadata

In [None]:
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(num_rows=5000)
synthetic_data.to_csv('experiment/synthetic_data_coupula.csv', index=False)

In [157]:
for epoch in [100, 300, 400, 500]:
    synthesizer2 = CTGANSynthesizer(metadata, epochs=500)
    synthesizer2.fit(real_data)
    synthetic_data2 = synthesizer2.sample(num_rows=5000)
    synthetic_data2.to_csv(f'./experiment/synthetic_data_CTGANSynthesizer_epoch{epoch}.csv', index=False)

In [None]:

for epsilon in [1, 5, 10]:
    for degree_of_bayesian_network in [2, 3]:

        description_file_1 = f'./experiment/description_correlated_attribute_mode_e{epsilon}_k{degree_of_bayesian_network}.json'
        synthetic_data_1 = f'./experiment/synthetic_data_correlated_attribute_mode_e{epsilon}_k{degree_of_bayesian_network}.csv'
        os.makedirs(os.path.dirname(description_file_1), exist_ok=True)
        os.makedirs(os.path.dirname(synthetic_data_1), exist_ok=True)

        mode = 'correlated_attribute_mode'

        threshold_value = 20

        num_tuples_to_generate = 5000


        describer = DataDescriber(category_threshold=threshold_value)
        describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data,
                                                                epsilon=epsilon,
                                                                k=degree_of_bayesian_network,
                                                                attribute_to_is_categorical=categorical_attributes,
                                                                attribute_to_is_candidate_key=candidate_keys)
        describer.save_dataset_description_to_file(description_file_1)

        generator = DataGenerator()
        generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file_1)
        generator.save_synthetic_data(synthetic_data_1)

        input_df = pd.read_csv(input_data, skipinitialspace=True)
        synthetic_df = pd.read_csv(synthetic_data_1)
        # Read attribute description from the dataset description file.
        attribute_description = read_json_file(description_file_1)['attribute_description']

        inspector = ModelInspector(input_df, synthetic_df, attribute_description)
        print(f"Completed for epsilon={epsilon}, k={degree_of_bayesian_network}")

Adding ROOT Barriers
Adding attribute Gender
Adding attribute Frequency
Adding attribute VALUE
Adding attribute Disability
Adding attribute Age
Completed for epsilon=1, k=2
Adding ROOT Barriers
Adding attribute Gender
Adding attribute Frequency
Adding attribute VALUE
Adding attribute Disability
Adding attribute Age
Completed for epsilon=1, k=3
Adding ROOT Barriers
Adding attribute Gender
Adding attribute VALUE
Adding attribute Frequency
Adding attribute Disability
Adding attribute Age
Completed for epsilon=5, k=2
Adding ROOT Barriers
Adding attribute Gender
Adding attribute VALUE
Adding attribute Frequency
Adding attribute Disability
Adding attribute Age
Completed for epsilon=5, k=3
Adding ROOT Barriers
Adding attribute Gender
Adding attribute VALUE
Adding attribute Frequency
Adding attribute Disability
Adding attribute Age
Completed for epsilon=10, k=2
Adding ROOT Barriers
Adding attribute Gender
Adding attribute VALUE
Adding attribute Frequency
Adding attribute Disability
Adding attr

In [159]:
display_bayesian_network(describer.bayesian_network)

Constructed Bayesian network:
    Gender     has parents ['Barriers'].
    VALUE      has parents ['Gender', 'Barriers'].
    Frequency  has parents ['Gender', 'VALUE', 'Barriers'].
    Disability has parents ['VALUE', 'Frequency', 'Gender'].
    Age        has parents ['Frequency', 'Disability', 'VALUE'].


In [None]:

for epsilon in [1, 5, 10]:
        # for degree_of_bayesian_network in [2, 3]:

        description_file_1 = f'./experiment/description_independent_attribute_mode_e{epsilon}.json'
        synthetic_data_1 = f'./experiment/description_independent_attribute_mode_e{epsilon}.csv'
        os.makedirs(os.path.dirname(description_file_1), exist_ok=True)
        os.makedirs(os.path.dirname(synthetic_data_1), exist_ok=True)

        mode = 'independent_attribute_mode'

        threshold_value = 20

        num_tuples_to_generate = 5000 

        describer = DataDescriber(category_threshold=threshold_value)
        describer.describe_dataset_in_independent_attribute_mode(dataset_file=input_data,
                                                                epsilon=epsilon,
                                                                attribute_to_is_categorical=categorical_attributes,
                                                                attribute_to_is_candidate_key=candidate_keys)
        describer.save_dataset_description_to_file(description_file_1)

        generator = DataGenerator()
        generator.generate_dataset_in_independent_mode(num_tuples_to_generate, description_file_1)
        generator.save_synthetic_data(synthetic_data_1)

        input_df = pd.read_csv(input_data, skipinitialspace=True)
        synthetic_df = pd.read_csv(synthetic_data_1)
        # Read attribute description from the dataset description file.
        attribute_description = read_json_file(description_file_1)['attribute_description']

        inspector = ModelInspector(input_df, synthetic_df, attribute_description)
        print(f"Completed for epsilon={epsilon}")

Completed for epsilon=1
Completed for epsilon=5
Completed for epsilon=10


In [None]:
import pandas as pd
import os

folder_path = "E:/Olha/SSCI disability/experiment/" 


dataframes = {}


for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        df = pd.read_csv(file_path)
        dataframes[file_name[:-4]] = df # Remove the .csv extension from the key
print("Loaded DataFrames:", dataframes.keys())

dataframe_keys = list(dataframes.keys())
dataframes[dataframe_keys[1]].head()

file_name = 'Data_num.csv'
real_data = pd.read_csv(file_name)

import numpy as np
from scipy import stats

def kl_divergence(p, q):
    """Compute the KL divergence between two probability distributions."""
    p = np.array(p)
    q = np.array(q)
    return np.sum(p * np.log(p / q))

# Function to compute KL Divergence for a categorical column
def compute_kl_divergence(column_name,synthetic_data):
    original_counts = real_data[column_name].value_counts(normalize=True)
    synthetic_counts = synthetic_data[column_name].value_counts(normalize=True)

    all_categories = set(original_counts.index) | set(synthetic_counts.index)

    original_probs = original_counts.reindex(all_categories, fill_value=0)
    synthetic_probs = synthetic_counts.reindex(all_categories, fill_value=0)

    # Avoid division by zero and log of zero
    original_probs += 1e-10
    synthetic_probs += 1e-10

    kl_div = kl_divergence(original_probs, synthetic_probs)

    return kl_div

def convert_to_numeric_or_categorical(df, column):
   df[column] = df[column].astype('category').cat.codes  # Convert to categorical codes

def compare_categorical(column_original, column_synthetic, synthetic_data):
    contingency_table = pd.crosstab(real_data[column_original], synthetic_data[column_synthetic])

    # Perform Chi-Square Test
    chi2_stat, p_value, dof, expected = stats.chi2_contingency(contingency_table)
    return chi2_stat, p_value

from sdv.evaluation.single_table import run_diagnostic, evaluate_quality
from sdv.evaluation.single_table import get_column_plot
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath='Data_num_val.csv')
metadata.validate()



Loaded DataFrames: dict_keys(['description_independent_attribute_mode_e1', 'description_independent_attribute_mode_e10', 'description_independent_attribute_mode_e5', 'synthetic_data_correlated_attribute_mode_e10_k2', 'synthetic_data_correlated_attribute_mode_e10_k3', 'synthetic_data_correlated_attribute_mode_e1_k2', 'synthetic_data_correlated_attribute_mode_e1_k3', 'synthetic_data_correlated_attribute_mode_e5_k2', 'synthetic_data_correlated_attribute_mode_e5_k3', 'synthetic_data_coupula', 'synthetic_data_CTGANSynthesizer_epoch100', 'synthetic_data_CTGANSynthesizer_epoch300', 'synthetic_data_CTGANSynthesizer_epoch400', 'synthetic_data_CTGANSynthesizer_epoch500', 'synth_BN_new_v4_stand100k', 'synth_BN_standart'])


In [None]:
eval_data =[]

for df_name in dataframe_keys:
    df = dataframes[df_name] 
    print(f"Processing DataFrame: {df_name}")
    diagnostic = run_diagnostic(real_data.drop(columns=['VALUE']), df.drop(columns=['VALUE']), metadata)
    quality_report = evaluate_quality(real_data.drop(columns=['VALUE']), df.drop(columns=['VALUE']), metadata)
    print(df_name)

    # print(f'Data Validity {diagnostic.get_properties().Score[0]}')
    # print(f'Data Structure {diagnostic.get_properties().Score[1]}')
    # print(f'Column Shapes {quality_report.get_properties().Score[0]}')
    # print(f'Column Pair Trend {quality_report.get_properties().Score[1]}')

    eval_data.append({
            "Mode": df_name,
            "Data Validity": diagnostic.get_properties().Score[0],
            "Data Structure": diagnostic.get_properties().Score[1],
            "Column Shapes": quality_report.get_properties().Score[0],
            "Column Pair Trend": quality_report.get_properties().Score[1],

        })
    
evaluationSDK_df = pd.DataFrame(eval_data)
evaluationSDK_df = evaluationSDK_df.set_index('Mode')
ev_round = round(evaluationSDK_df, 4)
ev_round

Processing DataFrame: description_independent_attribute_mode_e1
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 5/5 [00:00<00:00, 835.72it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 501.23it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 5/5 [00:00<00:00, 713.29it/s]|
Column Shapes Score: 95.73%

(2/2) Evaluating Column Pair Trends: |██████████| 10/10 [00:00<00:00, 99.30it/s]|
Column Pair Trends Score: 92.26%

Overall Score (Average): 93.99%

description_independent_attribute_mode_e1
Processing DataFrame: description_independent_attribute_mode_e10
Generating report ...

(1/2) Evaluating Data Validity: |██████████| 5/5 [00:00<00:00, 832.20it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1031.30it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%

Generating repo

Unnamed: 0_level_0,Data Validity,Data Structure,Column Shapes,Column Pair Trend
Mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
description_independent_attribute_mode_e1,1.0,1.0,0.9573,0.9226
description_independent_attribute_mode_e10,1.0,1.0,0.9869,0.9746
description_independent_attribute_mode_e5,1.0,1.0,0.985,0.971
synthetic_data_correlated_attribute_mode_e10_k2,1.0,1.0,0.9799,0.9611
synthetic_data_correlated_attribute_mode_e10_k3,1.0,1.0,0.9854,0.9684
synthetic_data_correlated_attribute_mode_e1_k2,1.0,1.0,0.9437,0.8986
synthetic_data_correlated_attribute_mode_e1_k3,1.0,1.0,0.9593,0.9208
synthetic_data_correlated_attribute_mode_e5_k2,1.0,1.0,0.9751,0.9519
synthetic_data_correlated_attribute_mode_e5_k3,1.0,1.0,0.9831,0.9598
synthetic_data_coupula,1.0,1.0,0.989,0.976


In [None]:
results = []

for df_name in dataframe_keys:
    df = dataframes[df_name]  # Get the DataFrame for the current file

    for col in df.columns:
        # Chi-Square Test
        chi2_stat, p_value = compare_categorical(col, col, df)
        if chi2_stat == 0:
            decision = "Error"
        else:
            decision = "Same" if p_value > 0.05 else "Different" 

        #  Kullback-Leibler divergence
        kl = compute_kl_divergence(col, df)

        results.append({
            "Mode": df_name,
            "Column": col,
            "P-value": p_value,
            "Chi-Square": chi2_stat,
            "KL": kl,
            "Decision": decision
        })

results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Mode,Column,P-value,Chi-Square,KL,Decision
0,description_independent_attribute_mode_e1,Disability,0.465559,100.556771,0.084330,Same
1,description_independent_attribute_mode_e1,Age,0.490265,3.419234,0.000042,Same
2,description_independent_attribute_mode_e1,Gender,0.022641,5.195872,0.000200,Different
3,description_independent_attribute_mode_e1,Barriers,0.469967,3.552325,0.001027,Same
4,description_independent_attribute_mode_e1,Frequency,0.727830,6.120210,0.003340,Same
...,...,...,...,...,...,...
91,synth_BN_standart,Age,0.988610,0.318182,0.000000,Same
92,synth_BN_standart,Disability,0.957544,77.000000,0.000000,Same
93,synth_BN_standart,Barriers,1.000000,0.000000,0.000000,Error
94,synth_BN_standart,Frequency,0.136649,13.616162,0.000000,Same


In [195]:
results_df_2 = results_df[results_df['Column'] != 'VALUE']
kl_avg_by_mode = results_df_2.groupby('Mode')['KL'].mean()
kl_med_by_mode = results_df_2.groupby('Mode')['KL'].median()
# print(kl_avg_by_mode)
Chi_avg_by_mode = results_df_2.groupby('Mode')['Chi-Square'].mean()
Chi_med_by_mode = results_df_2.groupby('Mode')['Chi-Square'].median()
# print(Chi_avg_by_mode)

mean_diff_methods = pd.DataFrame({'KL_mean': kl_avg_by_mode, 'KL_median': kl_med_by_mode, 'Chi-Square_mean': Chi_avg_by_mode, 'Chi-Square_median': Chi_med_by_mode})
# mean_diff_methods.to_csv('experiment/mean_diff_methods.csv')

In [196]:
mean_diff_methods

Unnamed: 0_level_0,KL_mean,KL_median,Chi-Square_mean,Chi-Square_median
Mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
description_independent_attribute_mode_e1,0.017788,0.001027,23.768882,5.195872
description_independent_attribute_mode_e10,0.000628,0.000313,22.785622,5.527635
description_independent_attribute_mode_e5,0.00108,0.000283,23.513966,5.527635
synth_BN_new_v4_stand100k,6e-06,3e-06,20.331423,2.256374
synth_BN_standart,0.0,0.0,18.236364,0.318182
synthetic_data_CTGANSynthesizer_epoch100,0.009357,0.007957,19.099636,4.205737
synthetic_data_CTGANSynthesizer_epoch300,0.007314,0.005786,20.238474,1.882238
synthetic_data_CTGANSynthesizer_epoch400,0.006451,0.005732,22.76523,7.964847
synthetic_data_CTGANSynthesizer_epoch500,0.01184,0.007125,20.424811,5.04101
synthetic_data_correlated_attribute_mode_e10_k2,0.001366,0.000849,23.486041,13.689818


In [197]:
# ev_round = ev_round.set_index('Mode')
together_val = pd.concat([mean_diff_methods, ev_round], axis=1)
together_val

Unnamed: 0_level_0,KL_mean,KL_median,Chi-Square_mean,Chi-Square_median,Data Validity,Data Structure,Column Shapes,Column Pair Trend
Mode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
description_independent_attribute_mode_e1,0.017788,0.001027,23.768882,5.195872,1.0,1.0,0.9573,0.9226
description_independent_attribute_mode_e10,0.000628,0.000313,22.785622,5.527635,1.0,1.0,0.9869,0.9746
description_independent_attribute_mode_e5,0.00108,0.000283,23.513966,5.527635,1.0,1.0,0.985,0.971
synth_BN_new_v4_stand100k,6e-06,3e-06,20.331423,2.256374,1.0,1.0,0.9986,0.9979
synth_BN_standart,0.0,0.0,18.236364,0.318182,1.0,1.0,1.0,1.0
synthetic_data_CTGANSynthesizer_epoch100,0.009357,0.007957,19.099636,4.205737,1.0,1.0,0.948,0.9057
synthetic_data_CTGANSynthesizer_epoch300,0.007314,0.005786,20.238474,1.882238,1.0,1.0,0.9506,0.9195
synthetic_data_CTGANSynthesizer_epoch400,0.006451,0.005732,22.76523,7.964847,1.0,1.0,0.9536,0.9236
synthetic_data_CTGANSynthesizer_epoch500,0.01184,0.007125,20.424811,5.04101,1.0,1.0,0.9386,0.9025
synthetic_data_correlated_attribute_mode_e10_k2,0.001366,0.000849,23.486041,13.689818,1.0,1.0,0.9799,0.9611


In [198]:
results_df.to_csv('experiment/final/all_together_results.csv', index=False)
ev_round.to_csv('experiment/final/sdv_evaluation score.csv', index=False)
mean_diff_methods.to_csv('experiment/final/mean_diff_methods.csv')

together_val.to_csv('experiment/final/FINAL_FINAL_ALL.csv')