### Import all the necessary libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
def calculate_propensity_scores(
    real_data: pd.DataFrame,
    synthetic_data: pd.DataFrame,
    continuous_features=None,
    categorical_features=None,
    random_state: int = 42
) -> float:
    """
    Score how easily a classifier can tell synthetic rows from real rows.

    The two frames are stacked, a logistic-regression "real (0) vs.
    synthetic (1)" discriminator is fitted on the stacked rows, and the
    score is the mean of ``|p - 0.5|`` where ``p`` is the predicted
    probability of the synthetic class for each stacked row.

    - Source labels are built internally; the inputs are never modified.
    - Synthetic columns are reordered to match the real data.
    - Continuous features are standardised, categorical features are
      one-hot encoded (strings are supported).

    Note: the probabilities are predicted on the same rows the model was
    fitted on (in-sample), so no held-out data is involved in the score.

    Parameters
    ----------
    real_data : pd.DataFrame
        Real (reference) records.
    synthetic_data : pd.DataFrame
        Synthetic records; must have exactly the same column names as
        ``real_data`` (order may differ).
    continuous_features, categorical_features : iterable of str, optional
        Column names for each group. If both are ``None`` the groups are
        inferred from ``real_data`` dtypes (numeric/bool -> continuous,
        everything else -> categorical). If either is given, the two lists
        together must cover every column exactly once.
    random_state : int, default 42
        Passed to ``LogisticRegression``.

    Returns
    -------
    float
        Mean ``|p - 0.5|``. Since ``p`` lies in [0, 1], the value lies in
        [0, 0.5]; 0 means every row is predicted at exactly 0.5.

    Raises
    ------
    ValueError
        If the column sets differ, if the provided feature lists are
        inconsistent with the data (unknown, overlapping or unassigned
        columns), or if either frame has no rows.
    """

    # ---- 0) check feature set ----
    real_cols = list(real_data.columns)
    syn_cols = list(synthetic_data.columns)

    if set(real_cols) != set(syn_cols):
        raise ValueError(
            f"Feature mismatch detected.\n"
            f"Missing in synthetic: {set(real_cols) - set(syn_cols)}\n"
            f"Missing in real: {set(syn_cols) - set(real_cols)}"
        )

    # ---- 1) decide feature groups (user-provided lists preferred) ----
    if continuous_features is None and categorical_features is None:
        # fallback inference from the real data's dtypes
        continuous_features = real_data.select_dtypes(include=["number", "bool"]).columns.tolist()
        categorical_features = [c for c in real_cols if c not in continuous_features]
    else:
        continuous_features = list(continuous_features or [])
        categorical_features = list(categorical_features or [])

        provided = set(continuous_features) | set(categorical_features)
        missing = provided - set(real_cols)
        if missing:
            raise ValueError(f"Provided feature(s) not found in data: {missing}")

        overlap = set(continuous_features) & set(categorical_features)
        if overlap:
            raise ValueError(f"Features listed as both continuous and categorical: {overlap}")

        unassigned = set(real_cols) - provided
        if unassigned:
            raise ValueError(f"Some columns were not assigned to continuous/categorical: {unassigned}")

    # ---- 2) fail fast on empty input ----
    # Without rows from both sources there is nothing to discriminate; raise
    # a clear error here instead of an opaque one from the estimator.
    n_real = len(real_data)
    n_synthetic = len(synthetic_data)
    if n_real == 0 or n_synthetic == 0:
        raise ValueError(
            "Both real_data and synthetic_data must contain at least one row "
            f"(got {n_real} real, {n_synthetic} synthetic)."
        )

    # ---- 3) combine features and build source labels ----
    # pd.concat returns a new frame, so the caller's data is left untouched.
    # Labels live in a separate array (0 = real, 1 = synthetic) rather than a
    # temporary column, so they cannot collide with a real column name.
    X = pd.concat([real_data, synthetic_data[real_cols]], ignore_index=True)
    y = np.repeat([0, 1], [n_real, n_synthetic])

    # ---- 4) preprocess: scale continuous, one-hot categorical ----
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), continuous_features),
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=True), categorical_features),
        ],
        remainder="drop"
    )

    # ---- 5) fit model ----
    model = Pipeline(steps=[
        ("prep", preprocessor),
        ("lr", LogisticRegression(random_state=random_state))
    ])
    model.fit(X, y)

    # ---- 6) compute propensity score: mean |p - 0.5| ----
    # Column 1 of predict_proba corresponds to label 1 (synthetic).
    probability = model.predict_proba(X)[:, 1]
    return float(np.abs(probability - 0.5).mean())


In [3]:
# Load the real (reference) dataset; the path is relative to the notebook's working directory.
real_data = pd.read_csv('Real_simulated_data/test_check_v3_imbalance.csv')

In [4]:
# Sanity check: (rows, columns) of the real data as loaded, before any column is dropped.
real_data.shape

(100, 15)

In [5]:
# Remove the identifier column; it is not listed in cont_list / cat_list.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
# (`columns=` already selects the column axis, so `axis=1` is not needed.)
real_data = real_data.drop(columns=['patient_ids'], errors='ignore')

In [6]:
# Inspect the real data after dropping the ID column (long frames are truncated in the rendered output).
real_data

Unnamed: 0,age,gender,Feature_A,Feature_B,Feature_C,Feature_D,Feature_E,Feature_F,Feature_G,Feature_H,Feature_I,Feature_J,Feature_K,Feature_L
0,56,Male,Normal,class_1,Yes,Med_C,present,class_d,type_1,pain,A,X,positive,stage_2
1,69,Female,Low,class_0,No,Med_A,absent,class_b,type_0,no_pain,A,X,positive,stage_1
2,46,Male,Normal,class_1,Yes,Med_C,present,class_d,type_1,pain,A,X,positive,stage_2
3,32,Male,Normal,class_1,Yes,Med_C,present,class_d,type_1,pain,A,X,positive,stage_2
4,60,Male,Low,class_0,No,Med_B,absent,class_b,type_1,no_pain,A,X,negative,stage_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,46,Male,Normal,class_1,Yes,Med_C,present,class_c,type_1,pain,A,X,positive,stage_2
96,35,Male,High,class_2,Yes,Med_A,present,class_d,type_0,pain,A,X,positive,stage_2
97,43,Male,Low,class_0,No,Med_B,absent,class_a,type_1,no_pain,A,X,negative,stage_1
98,61,Male,Low,class_0,No,Med_A,absent,class_a,type_0,no_pain,A,X,positive,stage_1


In [7]:
# Column typing passed to calculate_propensity_scores: when lists are given,
# every data column has to appear in exactly one of them.
#
# Settings used for other datasets, kept here for reference:
#
#   migraine
#     cont_list = ['Age']
#     cat_list = ['Frequency', 'Location', 'Character', 'Intensity', 'Hypoacusis','Vertigo', 'Tinnitus', 'Diplopia', 'Defect', 'DPF', 'Dysphasia']
#
#   adult_data
#     cat_list = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income', 'education-num']
#     cont_list = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']
#
#   AllDxs
#     cont_list = ['A', 'B', 'C', 'D', 'E']
#     cat_list = ['F', 'G']

# Active configuration (labelled test_v0)
cont_list = ['age']
cat_list = [
    'gender',
    'Feature_A',
    'Feature_B',
    'Feature_C',
    'Feature_D',
    'Feature_E',
    'Feature_F',
    'Feature_G',
    'Feature_H',
    'Feature_I',
    'Feature_J',
    'Feature_K',
    'Feature_L',
]

In [8]:
# Load the first synthetic dataset (relative path).
# NOTE(review): the folder/file name suggests it was generated with CTAB-GAN+ — confirm.
synthetic_data = pd.read_csv('Synthetic_data/ctabgan+/synthetic_ctabgan+_test_check_v3_imbalance.csv')

In [9]:
# Count rows that are exact duplicates of an earlier row.
synthetic_data.duplicated().sum()

0

In [10]:
# Sanity check: (rows, columns) of the synthetic data as loaded.
synthetic_data.shape

(100, 15)

In [11]:
# Keep only the first occurrence of each fully duplicated row (index labels are not reset).
synthetic_data = synthetic_data.drop_duplicates()

In [12]:
# Inspect the synthetic data; at this point it still contains the patient_ids column.
synthetic_data

Unnamed: 0,patient_ids,age,gender,Feature_A,Feature_B,Feature_C,Feature_D,Feature_E,Feature_F,Feature_G,Feature_H,Feature_I,Feature_J,Feature_K,Feature_L
0,13,70,Male,High,class_2,Yes,Med_A,present,class_c,type_1,pain,A,X,positive,stage_2
1,9,70,Male,Normal,class_1,Yes,Med_A,present,class_c,type_1,pain,A,X,positive,stage_2
2,4,36,Male,Low,class_2,Yes,Med_C,present,class_c,type_0,pain,A,X,positive,stage_2
3,62,36,Male,Normal,class_1,No,Med_C,present,class_c,type_0,pain,A,X,positive,stage_2
4,52,29,Male,Normal,class_1,Yes,Med_C,present,class_d,type_1,pain,A,X,positive,stage_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1,67,Female,High,class_2,Yes,Med_C,present,class_c,type_0,pain,A,X,positive,stage_2
96,58,71,Male,High,class_1,Yes,Med_A,present,class_d,type_0,pain,A,X,positive,stage_2
97,58,65,Female,Normal,class_1,Yes,Med_C,present,class_d,type_1,pain,A,X,positive,stage_2
98,17,39,Male,Low,class_2,Yes,Med_A,present,class_d,type_2,pain,B,X,positive,stage_2


In [13]:
# Remove the identifier column so the synthetic columns match the real data.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
# (`columns=` already selects the column axis, so `axis=1` is not needed.)
synthetic_data = synthetic_data.drop(columns=['patient_ids'], errors='ignore')

In [14]:
# Round the continuous columns to the nearest whole number and store them as integers.
# NOTE(review): presumably this aligns them with integer-valued real columns such as age — confirm.
# astype(int) raises if any of these columns contains NaN.
synthetic_data[cont_list] = (
    synthetic_data[cont_list]
    .round()
    .astype(int)
)

In [15]:
# Load the second synthetic dataset (relative path).
# NOTE(review): the "dependent_mapping" suffix suggests a variant of the same generator run — confirm what HFGF stands for.
synthetic_data_HFGF = pd.read_csv('Synthetic_data/ctabgan+/synthetic_ctabgan+_test_check_v3_imbalance_dependent_mapping.csv')

In [16]:
# Count rows that are exact duplicates of an earlier row.
synthetic_data_HFGF.duplicated().sum()

0

In [17]:
# Sanity check: (rows, columns) of the second synthetic dataset as loaded.
synthetic_data_HFGF.shape

(100, 15)

In [18]:
# Inspect the second synthetic dataset; at this point it still contains the patient_ids column.
synthetic_data_HFGF

Unnamed: 0,patient_ids,age,gender,Feature_A,Feature_B,Feature_C,Feature_D,Feature_E,Feature_F,Feature_G,Feature_H,Feature_I,Feature_J,Feature_K,Feature_L
0,19,75,Male,Normal,class_1,Yes,Med_C,present,class_c,type_1,pain,A,X,positive,stage_2
1,61,32,Male,Low,class_0,No,Med_A,absent,class_b,type_0,no_pain,A,X,positive,stage_1
2,29,43,Female,Low,class_0,No,Med_A,absent,class_b,type_0,no_pain,A,X,positive,stage_1
3,69,23,Male,High,class_2,Yes,Med_D,present,class_d,type_2,pain,B,Y,negative,stage_2
4,64,29,Male,High,class_2,Yes,Med_A,present,class_d,type_0,pain,A,X,positive,stage_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,82,23,Male,High,class_2,Yes,Med_D,present,class_c,type_2,pain,B,Y,negative,stage_2
96,73,42,Male,Normal,class_1,Yes,Med_E,present,class_c,type_2,pain,B,Y,positive,stage_2
97,29,45,Male,Low,class_0,No,Med_A,absent,class_b,type_0,no_pain,A,X,positive,stage_1
98,21,27,Male,High,class_2,Yes,Med_A,present,class_d,type_0,pain,A,X,positive,stage_2


In [19]:
# Remove the identifier column so the synthetic columns match the real data.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
# (`columns=` already selects the column axis, so `axis=1` is not needed.)
synthetic_data_HFGF = synthetic_data_HFGF.drop(columns=['patient_ids'], errors='ignore')

In [20]:
# Round the continuous columns to the nearest whole number and store them as integers.
# NOTE(review): presumably this aligns them with integer-valued real columns such as age — confirm.
# astype(int) raises if any of these columns contains NaN.
synthetic_data_HFGF[cont_list] = (
    synthetic_data_HFGF[cont_list]
    .round()
    .astype(int)
)

In [21]:
# Propensity score of the first synthetic dataset against the real data.
calculate_propensity_scores(
    real_data,
    synthetic_data,
    continuous_features=cont_list,
    categorical_features=cat_list,
    random_state=42,
)

0.29589263229689355

In [22]:
# Propensity score of the second (HFGF) synthetic dataset against the real data.
calculate_propensity_scores(
    real_data,
    synthetic_data_HFGF,
    continuous_features=cont_list,
    categorical_features=cat_list,
    random_state=42,
)

0.059373790672065684