In [2]:
import pandas as pd
import numpy as np
import json
import torch

# Direct imports from the same directory
from tabddpm_adapter import TabDDPMAdapter
from tabddpm_benchmark import evaluate_tabddpm, print_evaluation_results
from tabddpm_utils import preprocess_data, get_tstr_results

In [3]:
# Load configuration
print("# Load configuration")
with open("config.json", "r") as f:
    config = json.load(f)

# Load configuration


In [4]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("Bank_Personal_Loan.csv")
print(f"Columns in dataset: {data_raw.columns.tolist()}")
print(f"Dataset shape: {data_raw.shape}")
print(data_raw.head())

# 1. Load and prepare the dataset
Columns in dataset: ['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
Dataset shape: (5000, 14)
   ID  Age  Experience  Income  ZIP Code  Family CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4  1/60          1         0   
1   2   45          19      34     90089       3  1/50          1         0   
2   3   39          15      11     94720       1  1/00          1         0   
3   4   35           9     100     94112       1  2/70          2         0   
4   5   35           8      45     91330       4  1/00          2         0   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0     

In [5]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")
data, categorical_columns = preprocess_data(data_raw)
print(f"Detected categorical columns: {categorical_columns}")

2025-04-27 11:37:31,849 - INFO - Converted Family to category type (has 4 unique values)
2025-04-27 11:37:31,849 - INFO - Converted CCAvg to category type (has 108 unique values)
2025-04-27 11:37:31,855 - INFO - Converted Education to category type (has 3 unique values)
2025-04-27 11:37:31,856 - INFO - Converted Personal Loan to category type (has 2 unique values)
2025-04-27 11:37:31,857 - INFO - Converted Securities Account to category type (has 2 unique values)
2025-04-27 11:37:31,859 - INFO - Converted CD Account to category type (has 2 unique values)
2025-04-27 11:37:31,860 - INFO - Converted Online to category type (has 2 unique values)
2025-04-27 11:37:31,861 - INFO - Converted CreditCard to category type (has 2 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['Family', 'CCAvg', 'Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']


In [6]:
# 3. Define the target column for this dataset
print("\n# 3. Define the target column for this dataset")
target_column = "Personal Loan"
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: Personal Loan


In [7]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")
X = data.drop(columns=[target_column])
y = data[target_column]
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")


# 4. Split the data into features and target
Features shape: (5000, 13)
Target shape: (5000,)
Target distribution:
Personal Loan
0    4520
1     480
Name: count, dtype: int64


In [8]:
# 5. Initialize and train TabDDPM
print("\n# 5. Initialize and train TabDDPM")
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tabddpm = TabDDPMAdapter(**config["tabddpm_params"], device=device)
print("Training TabDDPM model...")
tabddpm.fit(X, y)
print("Training completed")


# 5. Initialize and train TabDDPM
Training TabDDPM model...
Original data shape: (5000, 14), Target column: Personal Loan
Added StandardScaler for 6 numerical columns
Added OneHotEncoder for 7 categorical columns
Target 'Personal Loan' identified as categorical with 2 classes
Class mapping: {0: 0, 1: 1}

Preprocessing Summary:
- Number of numerical features: 6
- Number of categorical features: 7
- Categorical columns: ['Family', 'CCAvg', 'Education', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
- Target column: Personal Loan
- Target type: Categorical
- Number of target classes: 2
X shape: (5000, 13), y shape: (5000,)
X_tensor shape: torch.Size([5000, 129]), y_tensor shape: torch.Size([5000])
X_tensor shape: torch.Size([5000, 129])
y_tensor shape: torch.Size([5000])


Epoch 299, Loss: 0.7155: 100%|██████████| 300/300 [05:15<00:00,  1.05s/it]

Training completed





In [9]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")
n_samples = 1000  
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = tabddpm.generate(n_samples)
print(f"Generated {len(synthetic_data)} synthetic samples")
print("Synthetic data head:")
print(synthetic_data.head())


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Sample timestep    0
Sample timestep    0
Sample timestep    0
Sample timestep    0
Debug - out_dict type: <class 'torch.Tensor'>
Added target column 'Personal Loan' with 2 unique values
Final columns in synthetic data: ['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Securities Account', 'CD Account', 'Online', 'CreditCard', 'Personal Loan']
Generated 1000 synthetic samples
Synthetic data head:
            ID        Age  Experience      Income       ZIP Code  Family  \
0  5499.900000  18.600000   -7.600000  245.600000  105385.400000       2   
1  5499.900000  18.600000   -7.600000  245.600000  105385.400000       4   
2  4504.194941  31.573985    6.991497  245.600000  103137.771163       4   
3  3366.625321  60.249901   35.976439   63.533183   95312.209003       1   
4  5499.900000  18.600000   -7.600000  245.600000     572.600000       2   

  CCAvg  Education    Mortgage 

In [10]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
evaluation_results = evaluate_tabddpm(data, synthetic_data, target_column=target_column)
print_evaluation_results(evaluation_results)



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


2025-04-27 11:44:14,720 - INFO - Encoded categorical target with mapping: {0: 0, 1: 1}
Parameters: { "use_label_encoder" } are not used.

2025-04-27 11:44:23,704 - INFO - Encoded categorical targets with mapping: {0: 0, 1: 1}
Parameters: { "use_label_encoder" } are not used.

2025-04-27 11:44:32,681 - INFO - TabDDPM Evaluation Results:
2025-04-27 11:44:32,683 - INFO - 
Likelihood Fitness Metrics:
2025-04-27 11:44:32,684 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): 22.8104
2025-04-27 11:44:32,684 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -39.5063
2025-04-27 11:44:32,685 - INFO - 
Statistical Similarity Metrics:
2025-04-27 11:44:32,686 - INFO -   - Jensen-Shannon Divergence Mean (Categorical): 0.2030
2025-04-27 11:44:32,686 - INFO -   - Wasserstein Distance Mean (Numerical): 3436.5914
2025-04-27 11:44:32,687 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-04-27 11:44:32,688 - INFO -   LogisticRegression:
2025-04-27 11:44:32,689 - INFO -    

In [11]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression    0.7592  0.797765
RandomForest          0.7706  0.802595
MLP                   0.6070  0.687718
XGBoost               0.7010  0.758006


In [12]:
# 9. Save the synthetic data
print("\n# 9. Save synthetic data")
output_path = "Loan_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data saved to {output_path}")

print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to Loan_synthetic.csv

Test completed successfully!
