In [1]:
import pandas as pd
import numpy as np
from ctgan_adapter import CtganAdapter
from ctgan_benchmark import evaluate_ctgan, print_evaluation_results
from ctgan_utils import preprocess_data, get_tstr_results
import json

In [2]:
# Load configuration
# Read the run settings from config.json into `config`; later cells pull the
# CTGAN hyperparameters from its "ctgan_params" section.
print("# Load configuration")
# JSON is UTF-8 by specification; pin the encoding so decoding does not depend
# on the platform's locale default (e.g. cp1252 on Windows).
with open("config.json", "r", encoding="utf-8") as f:
    config = json.load(f)

# Load configuration


In [3]:
# Step 1 - read car.csv into a DataFrame and show a quick summary of it.
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("car.csv")
# One newline-separated print: column names, (rows, cols) shape, first rows.
print(
    f"Columns in dataset: {data_raw.columns.tolist()}",
    f"Dataset shape: {data_raw.shape}",
    data_raw.head(),
    sep="\n",
)

# 1. Load and prepare the dataset
Columns in dataset: ['Buying', 'Maint', 'Doors', 'Persons', 'Lug_boot', 'Safety', 'Class']
Dataset shape: (1728, 7)
  Buying  Maint Doors Persons Lug_boot Safety  Class
0  vhigh  vhigh     2       2    small    low  unacc
1  vhigh  vhigh     2       2    small    med  unacc
2  vhigh  vhigh     2       2    small   high  unacc
3  vhigh  vhigh     2       2      med    low  unacc
4  vhigh  vhigh     2       2      med    med  unacc


In [4]:
# 2. Preprocess data and detect categorical columns
# preprocess_data is a project helper (ctgan_utils). Its result is unpacked into
# the processed frame `data` and the list `categorical_columns`. In the recorded
# run it logged a category conversion for all seven columns, Class included.
print("\n# 2. Preprocess data and detect categorical columns")
data, categorical_columns = preprocess_data(data_raw)
# Echo what the helper detected so the reader can sanity-check it.
print(f"Detected categorical columns: {categorical_columns}")

2025-03-30 16:14:56,024 - INFO - Converted Buying to category type (has 4 unique values)
2025-03-30 16:14:56,026 - INFO - Converted Maint to category type (has 4 unique values)
2025-03-30 16:14:56,029 - INFO - Converted Doors to category type (has 4 unique values)
2025-03-30 16:14:56,031 - INFO - Converted Persons to category type (has 3 unique values)
2025-03-30 16:14:56,033 - INFO - Converted Lug_boot to category type (has 3 unique values)
2025-03-30 16:14:56,034 - INFO - Converted Safety to category type (has 3 unique values)
2025-03-30 16:14:56,037 - INFO - Converted Class to category type (has 4 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['Buying', 'Maint', 'Doors', 'Persons', 'Lug_boot', 'Safety', 'Class']


In [5]:
# Step 3 - name the label column that the remaining cells split on and evaluate.
target_column = "Class"
# Section banner and the chosen column, emitted with a single print call.
print(
    "\n# 3. Define the target column for this dataset",
    f"Target column: {target_column}",
    sep="\n",
)


# 3. Define the target column for this dataset
Target column: Class


In [6]:
# Step 4 - separate the label column from the predictor columns.
print("\n# 4. Split the data into features and target")
y = data[target_column]
X = data.drop(target_column, axis="columns")
# Report both shapes plus the per-class counts in one newline-separated print.
print(
    f"Features shape: {X.shape}",
    f"Target shape: {y.shape}",
    f"Target distribution:\n{y.value_counts()}",
    sep="\n",
)


# 4. Split the data into features and target
Features shape: (1728, 6)
Target shape: (1728,)
Target distribution:
Class
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64


In [7]:
# 5. Initialize and train CTGAN
# The adapter's constructor arguments come from the "ctgan_params" section of
# config.json, so hyperparameters are tuned there rather than in this cell.
print("\n# 5. Initialize and train CTGAN")
ctgan = CtganAdapter(**config["ctgan_params"])
print("Training CTGAN model...")
# Long-running step: the recorded run needed about 15 minutes for 300 epochs.
# NOTE(review): no random seed is set in this cell - confirm whether the config
# or the adapter fixes one; otherwise results may differ between runs.
ctgan.fit(X, y)
print("Training completed")


# 5. Initialize and train CTGAN
Training CTGAN model...


Training Epochs:   0%|          | 1/300 [00:03<19:20,  3.88s/it]

Epoch 0, Loss D: 25.1800, Loss G: 0.8814


Training Epochs:  10%|█         | 31/300 [01:59<17:07,  3.82s/it]

Epoch 30, Loss D: 0.2164, Loss G: 0.0412


Training Epochs:  20%|██        | 61/300 [04:02<16:41,  4.19s/it]

Epoch 60, Loss D: 0.1760, Loss G: 0.0764


Training Epochs:  30%|███       | 91/300 [05:46<11:10,  3.21s/it]

Epoch 90, Loss D: 0.5814, Loss G: -0.0236


Training Epochs:  40%|████      | 121/300 [07:12<08:29,  2.84s/it]

Epoch 120, Loss D: 0.4691, Loss G: -0.8783


Training Epochs:  50%|█████     | 151/300 [08:37<07:05,  2.85s/it]

Epoch 150, Loss D: 0.3995, Loss G: -0.7371


Training Epochs:  60%|██████    | 181/300 [10:02<05:36,  2.83s/it]

Epoch 180, Loss D: 0.2164, Loss G: -0.7007


Training Epochs:  70%|███████   | 211/300 [11:27<04:09,  2.81s/it]

Epoch 210, Loss D: 0.2455, Loss G: -0.6760


Training Epochs:  80%|████████  | 241/300 [12:53<02:51,  2.91s/it]

Epoch 240, Loss D: 0.1894, Loss G: -0.6769


Training Epochs:  90%|█████████ | 271/300 [14:17<01:20,  2.79s/it]

Epoch 270, Loss D: 0.0132, Loss G: -0.5083


Training Epochs: 100%|██████████| 300/300 [15:40<00:00,  3.13s/it]

Training completed





In [8]:
# Step 6 - draw synthetic rows from the fitted CTGAN adapter.
n_samples = 1_000  # number of synthetic rows to request
print(
    "\n# 6. Generate synthetic data",
    f"Generating {n_samples} synthetic samples...",
    sep="\n",
)
synthetic_data = ctgan.generate(n_samples)
# Confirm how many rows actually came back and preview the first few.
print(
    f"Generated {len(synthetic_data)} synthetic samples",
    "Synthetic data head:",
    synthetic_data.head(),
    sep="\n",
)


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Generated 1000 synthetic samples
Synthetic data head:
  Buying  Maint  Doors Persons Lug_boot Safety  Class
0   high    med  5more    more      med    med    acc
1    low   high      2       2      big    med  unacc
2  vhigh    low      2       2      big   high  unacc
3  vhigh  vhigh  5more       2      med   high    acc
4    low  vhigh      3       2      med    low  unacc


In [9]:
# Step 7 - score the synthetic sample against the preprocessed real data.
print(
    "\n# 7. Evaluate quality using TSTR and other metrics",
    "Running evaluation...",
    sep="\n",
)
evaluation_results = evaluate_ctgan(
    data,
    synthetic_data,
    target_column=target_column,
)
# Hand the collected results to the project's reporting helper.
print_evaluation_results(evaluation_results)


2025-03-30 16:44:34,542 - INFO - Encoded categorical target with mapping: {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3}



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


Parameters: { "use_label_encoder" } are not used.

2025-03-30 16:44:37,108 - INFO - Encoded categorical targets with mapping: {'acc': 0, 'good': 1, 'unacc': 2, 'vgood': 3}
Parameters: { "use_label_encoder" } are not used.

2025-03-30 16:44:42,199 - INFO - CTGAN Evaluation Results:
2025-03-30 16:44:42,201 - INFO - 
Likelihood Fitness: Not applicable for fully categorical data
2025-03-30 16:44:42,201 - INFO - 
Statistical Similarity Metrics:
2025-03-30 16:44:42,202 - INFO -   - Jensen-Shannon Divergence Mean (Categorical): 0.0335
2025-03-30 16:44:42,202 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-03-30 16:44:42,203 - INFO -   LogisticRegression:
2025-03-30 16:44:42,203 - INFO -     - Accuracy: 0.9162
2025-03-30 16:44:42,204 - INFO -     - F1: 0.9163
2025-03-30 16:44:42,205 - INFO -   RandomForest:
2025-03-30 16:44:42,205 - INFO -     - Accuracy: 0.9682
2025-03-30 16:44:42,205 - INFO -     - F1: 0.9696
2025-03-30 16:44:42,205 - INFO -   MLP:
2025-03-30 16:44:42,207 - IN

In [10]:
# Step 8 - pull the TSTR table out of the evaluation results and show it.
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is None:
    pass  # the helper returned nothing to display
else:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression  0.785301  0.741253
RandomForest        0.743056  0.707837
MLP                 0.721644  0.695015
XGBoost             0.729745  0.704093


In [11]:
# 9. Save the synthetic data
# Write the generated rows to CSV without the pandas index column.
print("\n# 9. Save synthetic data")
# Name the file after the dataset this notebook actually synthesizes (car.csv).
# The earlier "bank_loan_synthetic.csv" name did not match the data and looks
# like a copy-paste leftover. Recorded output predating this change shows it.
output_path = "car_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data saved to {output_path}")

print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to bank_loan_synthetic.csv

Test completed successfully!
