# In [None]:
import pandas as pd
import numpy as np
import dcor
from sklearn.preprocessing import LabelEncoder

def auto_detect_discrete_columns(df, max_unique_numeric=20, max_unique_ratio=0.05):
    """Return the names of the columns of ``df`` to treat as discrete.

    A column is reported as discrete when any of the following holds:

    * its dtype is string, object or boolean;
    * it is numeric and has at most ``max_unique_numeric`` distinct values;
    * it is numeric and its distinct values make up less than
      ``max_unique_ratio`` of the rows.

    Args:
        df: DataFrame to inspect. It is not modified.
        max_unique_numeric: Largest number of distinct (non-missing) values a
            numeric column may have and still count as discrete.
        max_unique_ratio: Numeric columns whose distinct-to-row ratio is
            strictly below this value count as discrete.

    Returns:
        list: Discrete column names, in the order they appear in
        ``df.columns``.
    """
    n_rows = len(df)
    discrete_cols = []

    for col in df.columns:
        series = df[col]

        # Object columns are checked explicitly: depending on the pandas
        # version, is_string_dtype() rejects object columns that contain
        # missing values, which would leave raw strings in the data.
        if (
            pd.api.types.is_string_dtype(series)
            or pd.api.types.is_object_dtype(series)
            or pd.api.types.is_bool_dtype(series)
        ):
            discrete_cols.append(col)
            continue

        if pd.api.types.is_numeric_dtype(series):
            n_unique = series.nunique()
            # An empty frame has no rows to divide by; use a ratio of 0.
            unique_ratio = n_unique / n_rows if n_rows else 0.0
            if n_unique <= max_unique_numeric or unique_ratio < max_unique_ratio:
                discrete_cols.append(col)

    return discrete_cols

def preprocess_data(df, max_unique_numeric=20):
    """Label-encode the discrete columns of ``df`` in place.

    Discrete columns are selected with ``auto_detect_discrete_columns``. Each
    one is converted to strings, missing entries are labelled ``'MISSING'``,
    and the strings are replaced by the integer codes produced by
    ``LabelEncoder``. All other columns are left untouched.

    Numeric discrete columns are encoded from their string form as well.

    Args:
        df: DataFrame to encode. It is modified in place; pass a copy to keep
            the original values.
        max_unique_numeric: Forwarded to ``auto_detect_discrete_columns``.

    Returns:
        The same ``df`` object, with its discrete columns integer-encoded.
    """
    discrete_cols = auto_detect_discrete_columns(df, max_unique_numeric)

    le = LabelEncoder()
    for col in discrete_cols:
        # Record missing entries before the string cast: astype(str) turns
        # NaN into the literal 'nan', after which fillna() has nothing left
        # to fill and the 'MISSING' label would never be applied.
        missing = df[col].isna()
        labels = df[col].astype(str).mask(missing, 'MISSING')
        # fit_transform refits the encoder, so codes are assigned per column.
        df[col] = le.fit_transform(labels)

    return df

def compute_distance_correlation(data):
    """Build the pairwise distance-correlation matrix of the columns of ``data``.

    Args:
        data: DataFrame whose columns are compared pairwise. Column values are
            handed to ``dcor.distance_correlation`` unchanged (presumably they
            are already numeric at this point).

    Returns:
        numpy.ndarray: Square matrix of shape ``(n_columns, n_columns)`` whose
        entry ``[i, j]`` is the distance correlation between columns ``i`` and
        ``j``.
    """
    n_features = data.shape[1]
    dcor_matrix = np.zeros((n_features, n_features))

    # Pull each column out once rather than on every inner iteration.
    columns = [data.iloc[:, i].values for i in range(n_features)]

    for i in range(n_features):
        # Distance correlation is symmetric in its two arguments, so only the
        # upper triangle (diagonal included) is computed and then mirrored.
        for j in range(i, n_features):
            value = dcor.distance_correlation(columns[i], columns[j])
            dcor_matrix[i, j] = value
            dcor_matrix[j, i] = value

    return dcor_matrix

def nonlinear_correlation_diff(real_path, synthetic_path, max_unique_numeric=20):
    """Compare the distance-correlation structure of a real and a synthetic CSV.

    Both files are loaded, the synthetic columns are reordered to match the
    real ones, each frame is passed through ``preprocess_data``, and a
    pairwise distance-correlation matrix is computed for each. The score is
    the Frobenius norm of the difference between the two matrices.

    NOTE(review): ``preprocess_data`` detects discrete columns separately for
    each frame, so a column can end up label-encoded in one frame and left
    numeric in the other.

    Args:
        real_path: Path of the CSV file holding the real data.
        synthetic_path: Path of the CSV file holding the synthetic data.
        max_unique_numeric: Threshold forwarded to the discrete-column
            detection.

    Returns:
        dict with keys:
            ``'discrete_columns'``: discrete columns detected in the real data;
            ``'distance_correlation_diff'``: Frobenius norm of the difference;
            ``'real_dcor_matrix'``: matrix computed from the real data;
            ``'syn_dcor_matrix'``: matrix computed from the synthetic data.

    Raises:
        ValueError: If the two files do not have the same set of column names.
    """
    real_df = pd.read_csv(real_path)
    syn_df = pd.read_csv(synthetic_path)

    # Validate with an explicit exception: an assert is removed under
    # ``python -O`` and would let mismatched files through.
    real_cols = set(real_df.columns)
    syn_cols = set(syn_df.columns)
    if real_cols != syn_cols:
        raise ValueError(
            "real and synthetic CSV files must have the same columns; "
            f"only in real: {sorted(real_cols - syn_cols, key=str)}, "
            f"only in synthetic: {sorted(syn_cols - real_cols, key=str)}"
        )
    # Align the column order so matrix entries refer to the same column pair.
    syn_df = syn_df[real_df.columns]

    # Detect once on the untouched real frame; preprocessing works on copies.
    discrete_columns = auto_detect_discrete_columns(real_df, max_unique_numeric)

    real_processed = preprocess_data(real_df.copy(), max_unique_numeric)
    syn_processed = preprocess_data(syn_df.copy(), max_unique_numeric)

    real_dcor = compute_distance_correlation(real_processed)
    syn_dcor = compute_distance_correlation(syn_processed)

    diff = np.linalg.norm(real_dcor - syn_dcor, 'fro')

    return {
        'discrete_columns': discrete_columns,
        'distance_correlation_diff': diff,
        'real_dcor_matrix': real_dcor,
        'syn_dcor_matrix': syn_dcor
    }
# ---------------------------------------------------------------------------
# Driver: choose one real dataset and one synthetic dataset, then report the
# distance-correlation difference between them.
#
# Exactly one `real_path` and one `fake_path` assignment is left uncommented;
# the commented lines are alternative inputs kept for quick switching.
# ---------------------------------------------------------------------------

# Real datasets (relative paths).
#real_path = '../CTGAN-main/CTGAN-main/examples/csv/train_clean.csv'
#real_path = "../CTAB-GAN-main/Real_Datasets/CreditLong2.csv"
real_path = "../CTAB-GAN-main/Real_Datasets/Adult3.csv"
#real_path = "../synthcity-main/tutorials/covertype_preprocessed.csv"

# Synthetic datasets: newtrans_calc_pro_* outputs.
#fake_path = "../synthcity-main/tutorials/newtrans_calc_pro_tit.csv"
#fake_path = "../synthcity-main/tutorials/newtrans_calc_pro_cre1.csv"
#fake_path = "../synthcity-main/tutorials/newtrans_calc_pro_adu.csv"
#fake_path = "../synthcity-main/tutorials/newtrans_calc_pro_cov.csv"

# Synthetic datasets: *-Tit / Titanic files.
#fake_path = "../synthcity-main/tutorials/OriginalCTGAN-Tit_5.csv"
#fake_path = "../synthcity-main/tutorials/CTABGAN-Tit_5.csv"
#fake_path = "../synthcity-main/tutorials/TVAE-Tit_5.csv"
#fake_path = "../synthcity-main/tutorials/DDPM-Titanic_1.csv"

# Synthetic datasets: *-Credit / *-Cre files.
#fake_path = "../synthcity-main/tutorials/OriginalCTGAN-Credit_31.csv"
#fake_path = "../synthcity-main/tutorials/CTABGAN-Credit0_31.csv"
#fake_path = "../synthcity-main/tutorials/TVAE-Cre_1.csv"
#fake_path = "../synthcity-main/tutorials/DDPM-KL-credit200.csv"

# Synthetic datasets: *-Adu files.
#fake_path = "../synthcity-main/tutorials/OriginalCTGAN-Adu_5.csv"
#fake_path = "../synthcity-main/tutorials/CTABGAN-Adu_5.csv"
#fake_path = "../synthcity-main/tutorials/TVAE-Adu_5.csv"
#fake_path = "../synthcity-main/tutorials/DDPM-Adu_1.csv"

# Synthetic datasets: *-cover files.
#fake_path = "../synthcity-main/tutorials/octgan-cover200.csv"
#fake_path = "../synthcity-main/tutorials/CTABGAN-cover200.csv"
#fake_path = "../synthcity-main/tutorials/TVAE-cover-5.csv"
#fake_path = "../synthcity-main/tutorials/DDPM-cover-5.csv"

# Synthetic datasets: other variants.
#fake_path = "../synthcity-main/tutorials/mix_tit.csv"
#fake_path = "../synthcity-main/tutorials/cov-base+corr.csv"
#fake_path = "../synthcity-main/tutorials/cov-base+encoder.csv"

#fake_path = r"..\synthcity-main\tutorials\duanwen-trans-cre.csv"

# Absolute paths on the local G: drive.
# fake_path = "G:/DataSets/Fake_Dataset/TransCTGAN-finaladu200_5.csv"
# fake_path = "G:/DataSets/Fake_Dataset/TransCTGAN-finalchurn300.csv"
#real_path = "G:/DataSets/Churn.csv"
# real_path = "G:/DataSets/Adult.csv"
# fake_path = "G:/DataSets/Fake_Dataset/Synthcity-transctgan-adu200.csv"
# real_path = 'G:/DataSets/adult_processed_0.csv'
# fake_path = "G:/DataSets/Fake_Dataset/Synthcity-transctgan-newadu200.csv"
# real_path = 'G:/DataSets/Fake_Dataset/Credit150k.csv'
# #fake_path = "G:/DataSets/Fake_Dataset/CTGAN-Credit150k_50epochs.csv"
# fake_path = "G:/DataSets/Fake_Dataset/CTABGAN-Credit150k0.csv"
# fake_path = "G:/DataSets/Fake_Dataset/TransCTGAN-finalcover30k_1.csv"
#real_path = 'G:/DataSets/Credit10k.csv'
# real_path = '../CTGAN-main/CTGAN-main/examples/csv/train_clean.csv'
# fake_path = 'G:/DataSets/Fake_Dataset/Synthcity-CTABGANplus-tit200_0.csv'
#real_path = r'G:\DataSets\Covertype30k.csv'
fake_path = 'G:/DataSets/Fake_Dataset/2.csv'
# Other synthetic file names noted for reference:
#Synthcity-tvae-cover30k200e.csv
#CTABGAN-Cover30k{i}.csv

# Run the comparison on the selected pair of files.
result = nonlinear_correlation_diff(
    real_path=real_path,
    synthetic_path=fake_path,
    max_unique_numeric=20  
)

# Columns detected as discrete in the real dataset.
print(f" {result['discrete_columns']}")
# Frobenius norm of the matrix difference, shown with four decimals.
print(f" {result['distance_correlation_diff']:.4f}")