In [1]:
import pandas as pd
import numpy as np
from ctgan_adapter import CtganAdapter
from ctgan_benchmark import evaluate_ctgan, print_evaluation_results
from ctgan_utils import preprocess_data, get_tstr_results
import json

In [2]:
# Load configuration
print("# Load configuration")
with open("config.json", "r") as f:
    config = json.load(f)

# Load configuration


In [3]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("Satellite.csv")
print(f"Columns in dataset: {data_raw.columns.tolist()}")
print(f"Dataset shape: {data_raw.shape}")
print(data_raw.head())

# 1. Load and prepare the dataset
Columns in dataset: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36']
Dataset shape: (4435, 37)
    0    1    2   3   4    5    6   7   8    9  ...   27  28   29   30   31  \
0  92  115  120  94  84  102  106  79  84  102  ...  104  88  121  128  100   
1  84  102  106  79  84  102  102  83  80  102  ...  100  84  107  113   87   
2  84  102  102  83  80  102  102  79  84   94  ...   87  84   99  104   79   
3  80  102  102  79  84   94  102  79  80   94  ...   79  84   99  104   79   
4  84   94  102  79  80   94   98  76  80  102  ...   79  84  103  104   79   

   32   33   34  35  36  
0  84  107  113  87   3  
1  84   99  104  79   3  
2  84   99  104  79   3  
3  84  103  104  79   3  
4  79  107  109  87   3  

[5 rows x 37 columns]


In [4]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")
data, categorical_columns = preprocess_data(data_raw)
print(f"Detected categorical columns: {categorical_columns}")

2025-03-31 08:15:04,190 - INFO - Converted 36 to category type (has 6 unique values)



# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['36']


In [5]:
# 3. Define the target column for this dataset
print("\n# 3. Define the target column for this dataset")
target_column = "36"
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: 36


In [6]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")
X = data.drop(columns=[target_column])
y = data[target_column]
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")


# 4. Split the data into features and target
Features shape: (4435, 36)
Target shape: (4435,)
Target distribution:
36
1    1072
7    1038
3     961
2     479
5     470
4     415
Name: count, dtype: int64


In [7]:
# 5. Initialize and train CTGAN
print("\n# 5. Initialize and train CTGAN")
ctgan = CtganAdapter(**config["ctgan_params"])
print("Training CTGAN model...")
ctgan.fit(X, y)
print("Training completed")


# 5. Initialize and train CTGAN
Training CTGAN model...


Training Epochs:   0%|          | 1/300 [00:04<22:55,  4.60s/it]

Epoch 0, Loss D: 14.5796, Loss G: 1.7626


Training Epochs:  10%|█         | 31/300 [02:26<18:23,  4.10s/it]

Epoch 30, Loss D: -3.0556, Loss G: 0.2110


Training Epochs:  20%|██        | 61/300 [04:23<14:27,  3.63s/it]

Epoch 60, Loss D: -3.1801, Loss G: 0.2744


Training Epochs:  30%|███       | 91/300 [06:14<13:26,  3.86s/it]

Epoch 90, Loss D: -3.0514, Loss G: 0.4675


Training Epochs:  40%|████      | 121/300 [08:16<12:23,  4.16s/it]

Epoch 120, Loss D: -2.8914, Loss G: 0.4578


Training Epochs:  50%|█████     | 151/300 [10:16<09:54,  3.99s/it]

Epoch 150, Loss D: -3.0376, Loss G: 0.5589


Training Epochs:  60%|██████    | 181/300 [12:34<08:48,  4.44s/it]

Epoch 180, Loss D: -3.1152, Loss G: 0.5314


Training Epochs:  70%|███████   | 211/300 [15:02<07:12,  4.86s/it]

Epoch 210, Loss D: -3.0989, Loss G: 0.5294


Training Epochs:  80%|████████  | 241/300 [17:09<03:43,  3.79s/it]

Epoch 240, Loss D: -3.2875, Loss G: 0.5882


Training Epochs:  90%|█████████ | 271/300 [19:10<02:02,  4.24s/it]

Epoch 270, Loss D: -3.3223, Loss G: 0.5940


Training Epochs: 100%|██████████| 300/300 [21:09<00:00,  4.23s/it]

Training completed





In [8]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")
n_samples = 1000  
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = ctgan.generate(n_samples)
print(f"Generated {len(synthetic_data)} synthetic samples")
print("Synthetic data head:")
print(synthetic_data.head())


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Generated 1000 synthetic samples
Synthetic data head:
           0           1           2          3          4           5  \
0  87.599351  106.226241   93.951439  87.601895  90.259152  102.792962   
1  66.933657  101.096135   81.742887  67.589282  71.715229   76.969823   
2  70.537211   98.074047  101.257480  69.957174  78.393096   98.760673   
3  85.902818   99.890225  106.446538  91.134827  84.484883  108.429708   
4  84.246495  108.713866  112.921720  92.463245  88.559168  112.259095   

            6          7          8           9  ...         27         28  \
0  111.355641  73.947455  87.506111  106.489772  ...  81.386784  82.029284   
1   83.533367  75.510325  71.216631   68.885470  ...  77.272340  74.809416   
2  119.095875  92.040652  67.082076  100.924878  ...  85.120918  83.916986   
3   86.808461  90.163297  84.659857   90.518406  ...  82.661304  87.983531   
4  111.120134  84.693605  86.791615  110.947

In [9]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
evaluation_results = evaluate_ctgan(data, synthetic_data, target_column=target_column)
print_evaluation_results(evaluation_results)



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


2025-03-31 08:38:47,878 - INFO - Encoded categorical target with mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 08:39:03,442 - INFO - Encoded categorical targets with mapping: {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 7: 5}
Parameters: { "use_label_encoder" } are not used.

2025-03-31 08:39:06,777 - INFO - CTGAN Evaluation Results:
2025-03-31 08:39:06,779 - INFO - 
Likelihood Fitness Metrics:
2025-03-31 08:39:06,779 - INFO -   - Lsyn (Synthetic Data Log-Likelihood): -121.2791
2025-03-31 08:39:06,779 - INFO -   - Ltest (Real Data Log-Likelihood under Synthetic Model): -120.9488
2025-03-31 08:39:06,779 - INFO - 
Statistical Similarity Metrics:
2025-03-31 08:39:06,781 - INFO -   - Wasserstein Distance Mean (Numerical): 1.8686
2025-03-31 08:39:06,781 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-03-31 08:39:06,781 - INFO -   LogisticRegression:
2025-03-31 08:39:06,783 - INFO -     - Accuracy: 0.8591
2025-03-31 08:39:06,783 

In [10]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression  0.826381  0.820945
RandomForest        0.823901  0.823938
MLP                 0.823901  0.825714
XGBoost             0.822097  0.825244


In [11]:
# 9. Save the synthetic data
print("\n# 9. Save synthetic data")
output_path = "bank_loan_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data saved to {output_path}")

print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to bank_loan_synthetic.csv

Test completed successfully!
