In [13]:
import pandas as pd
import numpy as np
from ctgan_adapter import CtganAdapter
from ctgan_benchmark import evaluate_ctgan, print_evaluation_results
from ctgan_utils import preprocess_data, get_tstr_results
import json

In [14]:
# Load configuration
print("# Load configuration")
with open("config.json", "r") as f:
    config = json.load(f)

# Load configuration


In [15]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("Bank_Personal_Loan.csv")
print(f"Columns in dataset: {data_raw.columns.tolist()}")
print(f"Dataset shape: {data_raw.shape}")
print(data_raw.head())

# 1. Load and prepare the dataset
Columns in dataset: ['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
Dataset shape: (5000, 14)
   ID  Age  Experience  Income  ZIP Code  Family CCAvg  Education  Mortgage  \
0   1   25           1      49     91107       4  1/60          1         0   
1   2   45          19      34     90089       3  1/50          1         0   
2   3   39          15      11     94720       1  1/00          1         0   
3   4   35           9     100     94112       1  2/70          2         0   
4   5   35           8      45     91330       4  1/00          2         0   

   Personal Loan  Securities Account  CD Account  Online  CreditCard  
0              0                   1           0       0           0  
1              0                   1           0       0           0  
2              0                   0           0       0     

In [16]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")
data, categorical_columns = preprocess_data(data_raw)
print(f"Detected categorical columns: {categorical_columns}")

2025-03-31 10:52:45,610 - INFO - Converted Family to category type (has 4 unique values)
2025-03-31 10:52:45,612 - INFO - Converted CCAvg to category type (has 108 unique values)
2025-03-31 10:52:45,615 - INFO - Converted Education to category type (has 3 unique values)
2025-03-31 10:52:45,616 - INFO - Converted Personal Loan to category type (has 2 unique values)
2025-03-31 10:52:45,619 - INFO - Converted Securities Account to category type (has 2 unique values)
2025-03-31 10:52:45,621 - INFO - Converted CD Account to category type (has 2 unique values)
2025-03-31 10:52:45,623 - INFO - Converted Online to category type (has 2 unique values)
2025-03-31 10:52:45,624 - INFO - Converted CreditCard to category type (has 2 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['Family', 'CCAvg', 'Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']


In [17]:
# 3. Define the target column for this dataset
print("\n# 3. Define the target column for this dataset")
target_column = "Personal Loan"
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: Personal Loan


In [18]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")
X = data.drop(columns=[target_column])
y = data[target_column]
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")


# 4. Split the data into features and target
Features shape: (5000, 13)
Target shape: (5000,)
Target distribution:
Personal Loan
0    4520
1     480
Name: count, dtype: int64


In [19]:
# 5. Initialize and train CTGAN
print("\n# 5. Initialize and train CTGAN")
ctgan = CtganAdapter(**config["ctgan_params"])
print("Training CTGAN model...")
ctgan.fit(X, y)
print("Training completed")


# 5. Initialize and train CTGAN
Training CTGAN model...


Training Epochs:   0%|          | 1/300 [00:14<1:13:25, 14.73s/it]

Epoch 0, Loss D: 5.4190, Loss G: 0.7946


Training Epochs:  10%|█         | 31/300 [10:13<1:44:49, 23.38s/it]

Epoch 30, Loss D: -0.8237, Loss G: 0.5491


Training Epochs:  20%|██        | 61/300 [19:04<1:07:42, 17.00s/it]

Epoch 60, Loss D: -1.0479, Loss G: 0.6288


Training Epochs:  30%|███       | 91/300 [27:32<1:00:48, 17.46s/it]

Epoch 90, Loss D: -1.8690, Loss G: 0.6165


Training Epochs:  40%|████      | 121/300 [35:57<49:09, 16.48s/it] 

Epoch 120, Loss D: -2.6941, Loss G: 0.5433


Training Epochs:  50%|█████     | 151/300 [44:12<41:37, 16.76s/it]

Epoch 150, Loss D: -3.0312, Loss G: 0.6166


Training Epochs:  60%|██████    | 181/300 [52:40<32:39, 16.47s/it]

Epoch 180, Loss D: -3.1334, Loss G: 0.7197


Training Epochs:  70%|███████   | 211/300 [1:00:54<24:37, 16.60s/it]

Epoch 210, Loss D: -3.1584, Loss G: 0.7333


Training Epochs:  80%|████████  | 241/300 [1:09:24<17:17, 17.59s/it]

Epoch 240, Loss D: -3.2288, Loss G: 0.8066


Training Epochs:  90%|█████████ | 271/300 [1:17:43<07:59, 16.53s/it]

Epoch 270, Loss D: -3.2557, Loss G: 0.7479


Training Epochs: 100%|██████████| 300/300 [1:25:36<00:00, 17.12s/it]

Training completed





In [20]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")
n_samples = 1000  
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = ctgan.generate(n_samples)
print(f"Generated {len(synthetic_data)} synthetic samples")
print("Synthetic data head:")
print(synthetic_data.head())


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Generated 1000 synthetic samples
Synthetic data head:
            ID        Age  Experience      Income      ZIP Code  Family CCAvg  \
0  3492.829902  62.275067   33.431910   42.808217  92546.522657       2  2/40   
1   964.270717  50.499072   24.584849   60.338973  94607.713893       3  1/50   
2  4573.572756  30.611324    9.518605   46.278774  94494.380846       3  2/00   
3  4123.304121  59.483044   37.677594   49.271787  91364.496361       1  2/40   
4  1737.908190  63.278035   37.637363  190.947986  95418.366095       2  0/30   

   Education    Mortgage  Securities Account  CD Account  Online  CreditCard  \
0          2  172.025069                   0           0       1           0   
1          2    0.000000                   0           0       0           0   
2          2    0.088067                   0           0       0           0   
3          2  129.110736                   0           0       0        

In [21]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
evaluation_results = evaluate_ctgan(data, synthetic_data, target_column=target_column)
print_evaluation_results(evaluation_results)


2025-03-31 13:08:28,454 - INFO - Encoded categorical target with mapping: {0: 0, 1: 1}



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


Parameters: { "use_label_encoder" } are not used.

2025-03-31 13:08:38,641 - INFO - Encoded categorical targets with mapping: {0: 0, 1: 1}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 13:08:44,068 - INFO - CTGAN Evaluation Results:
2025-03-31 13:08:44,070 - INFO - 
Likelihood Fitness Metrics:
2025-03-31 13:08:44,072 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): -28.7804
2025-03-31 13:08:44,073 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -30.7562
2025-03-31 13:08:44,074 - INFO - 
Statistical Similarity Metrics:
2025-03-31 13:08:44,075 - INFO -   - Jensen-Shannon Divergence Mean (Categorical): 0.0872
2025-03-31 13:08:44,076 - INFO -   - Wasserstein Distance Mean (Numerical): 61.0105
2025-03-31 13:08:44,077 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-03-31 13:08:44,078 - INFO -   LogisticRegression:
2025-03-31 13:08:44,079 - INFO -     - Accuracy: 0.9750
2025-03-31 13:08:44,079 - INFO -     - F1: 0.9741
2025-03-31 13:08:44,

In [22]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression    0.9096  0.885126
RandomForest          0.9036  0.860495
MLP                   0.8932  0.870642
XGBoost               0.9008  0.857570


In [23]:
# 9. Save the synthetic data
print("\n# 9. Save synthetic data")
output_path = "Loan_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data saved to {output_path}")

print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to Loan_synthetic.csv

Test completed successfully!
