In [1]:
import pandas as pd
import numpy as np
from ctgan_adapter import CtganAdapter
from ctgan_benchmark import evaluate_ctgan, print_evaluation_results
from ctgan_utils import preprocess_data, get_tstr_results
import json

In [2]:
# Load configuration
print("# Load configuration")
with open("config.json", "r") as f:
    config = json.load(f)

# Load configuration


In [3]:
# 1. Load and prepare the dataset
print("# 1. Load and prepare the dataset")
data_raw = pd.read_csv("letter-recognition.csv")
print(f"Columns in dataset: {data_raw.columns.tolist()}")
print(f"Dataset shape: {data_raw.shape}")
print(data_raw.head())

# 1. Load and prepare the dataset
Columns in dataset: ['letter', 'xbox ', 'ybox ', 'width ', 'height', 'onpix ', 'xbar ', 'ybar ', 'x2bar', 'y2bar ', 'xybar ', 'x2ybar', 'xy2bar', 'xedge ', 'xedgey', 'yedge ', 'yedgex']
Dataset shape: (20000, 17)
  letter  xbox   ybox   width   height  onpix   xbar   ybar   x2bar  y2bar   \
0      T      2      8       3       5       1      8     13      0       6   
1      I      5     12       3       7       2     10      5      5       4   
2      D      4     11       6       8       6     10      6      2       6   
3      N      7     11       6       6       3      5      9      4       6   
4      G      2      1       3       1       1      8      6      6       6   

   xybar   x2ybar  xy2bar  xedge   xedgey  yedge   yedgex  
0       6      10       8       0       8       0       8  
1      13       3       9       2       8       4      10  
2      10       3       7       3       7       3       9  
3       4       4      10       6     

In [4]:
# 2. Preprocess data and detect categorical columns
print("\n# 2. Preprocess data and detect categorical columns")
data, categorical_columns = preprocess_data(data_raw)
print(f"Detected categorical columns: {categorical_columns}")

2025-04-02 14:51:54,728 - INFO - Converted letter to category type (has 26 unique values)
2025-04-02 14:51:54,730 - INFO - Converted xbox  to category type (has 16 unique values)
2025-04-02 14:51:54,733 - INFO - Converted ybox  to category type (has 16 unique values)
2025-04-02 14:51:54,736 - INFO - Converted width  to category type (has 16 unique values)
2025-04-02 14:51:54,737 - INFO - Converted height to category type (has 16 unique values)
2025-04-02 14:51:54,739 - INFO - Converted onpix  to category type (has 16 unique values)
2025-04-02 14:51:54,740 - INFO - Converted xbar  to category type (has 16 unique values)
2025-04-02 14:51:54,742 - INFO - Converted ybar  to category type (has 16 unique values)
2025-04-02 14:51:54,743 - INFO - Converted x2bar to category type (has 16 unique values)
2025-04-02 14:51:54,744 - INFO - Converted y2bar  to category type (has 16 unique values)
2025-04-02 14:51:54,744 - INFO - Converted xybar  to category type (has 16 unique values)
2025-04-02 14:5


# 2. Preprocess data and detect categorical columns
Detected categorical columns: ['letter', 'xbox ', 'ybox ', 'width ', 'height', 'onpix ', 'xbar ', 'ybar ', 'x2bar', 'y2bar ', 'xybar ', 'x2ybar', 'xy2bar', 'xedge ', 'xedgey', 'yedge ', 'yedgex']


In [5]:
# 3. Define the target column for this dataset
print("\n# 3. Define the target column for this dataset")
target_column = "letter"
print(f"Target column: {target_column}")


# 3. Define the target column for this dataset
Target column: letter


In [6]:
# 4. Split the data into features and target
print("\n# 4. Split the data into features and target")
X = data.drop(columns=[target_column])
y = data[target_column]
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Target distribution:\n{y.value_counts()}")


# 4. Split the data into features and target
Features shape: (20000, 16)
Target shape: (20000,)
Target distribution:
letter
U    813
D    805
P    803
T    796
M    792
A    789
X    787
Y    786
Q    783
N    783
F    775
G    773
E    768
B    766
V    764
L    761
R    758
I    755
O    753
W    752
S    748
J    747
K    739
C    736
H    734
Z    734
Name: count, dtype: int64


In [8]:
# 5. Initialize and train CTGAN
print("\n# 5. Initialize and train CTGAN")
ctgan = CtganAdapter(**config["ctgan_params"])
print("Training CTGAN model...")
ctgan.fit(X, y)
print("Training completed")


# 5. Initialize and train CTGAN
Training CTGAN model...


Training Epochs:   0%|          | 1/300 [01:40<8:18:45, 100.08s/it]

Epoch 0, Loss D: 14.4444, Loss G: 4.6455


Training Epochs:  10%|█         | 31/300 [1:11:01<10:32:43, 141.13s/it]

Epoch 30, Loss D: -1.5041, Loss G: 0.4148


Training Epochs:  20%|██        | 61/300 [2:14:52<7:27:24, 112.32s/it] 

Epoch 60, Loss D: -3.6350, Loss G: 0.6079


Training Epochs:  30%|███       | 91/300 [2:55:04<4:28:55, 77.20s/it] 

Epoch 90, Loss D: -4.2757, Loss G: 0.4690


Training Epochs:  40%|████      | 121/300 [3:33:16<3:47:31, 76.27s/it]

Epoch 120, Loss D: -4.1019, Loss G: 0.4729


Training Epochs:  50%|█████     | 151/300 [4:12:08<3:08:19, 75.84s/it]

Epoch 150, Loss D: -4.0280, Loss G: 0.4807


Training Epochs:  60%|██████    | 181/300 [4:50:22<2:31:05, 76.18s/it]

Epoch 180, Loss D: -4.0253, Loss G: 0.4543


Training Epochs:  70%|███████   | 211/300 [5:28:34<1:53:06, 76.26s/it]

Epoch 210, Loss D: -4.0554, Loss G: 0.4625


Training Epochs:  80%|████████  | 241/300 [6:06:35<1:14:25, 75.68s/it]

Epoch 240, Loss D: -4.0454, Loss G: 0.4231


Training Epochs:  90%|█████████ | 271/300 [6:49:07<51:17, 106.13s/it] 

Epoch 270, Loss D: -4.0405, Loss G: 0.3897


Training Epochs: 100%|██████████| 300/300 [7:44:52<00:00, 92.97s/it] 

Training completed





In [9]:
# 6. Generate synthetic data
print("\n# 6. Generate synthetic data")
n_samples = 1000  
print(f"Generating {n_samples} synthetic samples...")
synthetic_data = ctgan.generate(n_samples)
print(f"Generated {len(synthetic_data)} synthetic samples")
print("Synthetic data head:")
print(synthetic_data.head())


# 6. Generate synthetic data
Generating 1000 synthetic samples...
Generated 1000 synthetic samples
Synthetic data head:
   xbox   ybox   width   height  onpix   xbar   ybar   x2bar  y2bar   xybar   \
0      2      7       4       2       2      6      7      3       5      11   
1      6     11       8       8       8      8      4      5       3       6   
2      2      4       3       3       2      6      6      7       3       6   
3      3      6       4       4       3      8      7      6       5      11   
4      4     10       6       1       7      8      6      2       4       9   

   x2ybar  xy2bar  xedge   xedgey  yedge   yedgex letter  
0       7       6       2       7       2       6      N  
1       9      10       8       6       2       8      M  
2      10       7       2       9       1       8      G  
3       6       7       3       8       4       8      C  
4       7      10       8       6       2       5      M  


In [10]:
# 7. Evaluate quality using TSTR and other metrics
print("\n# 7. Evaluate quality using TSTR and other metrics")
print("Running evaluation...")
evaluation_results = evaluate_ctgan(data, synthetic_data, target_column=target_column)
print_evaluation_results(evaluation_results)


2025-04-03 09:51:41,695 - INFO - Encoded categorical target with mapping: {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25}



# 7. Evaluate quality using TSTR and other metrics
Running evaluation...


Parameters: { "use_label_encoder" } are not used.

2025-04-03 09:53:09,657 - INFO - Encoded categorical targets with mapping: {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'J': 9, 'K': 10, 'L': 11, 'M': 12, 'N': 13, 'O': 14, 'P': 15, 'Q': 16, 'R': 17, 'S': 18, 'T': 19, 'U': 20, 'V': 21, 'W': 22, 'X': 23, 'Y': 24, 'Z': 25}
Parameters: { "use_label_encoder" } are not used.

2025-04-03 09:53:25,266 - INFO - CTGAN Evaluation Results:
2025-04-03 09:53:25,269 - INFO - 
Likelihood Fitness: Not applicable for fully categorical data
2025-04-03 09:53:25,271 - INFO - 
Statistical Similarity Metrics:
2025-04-03 09:53:25,272 - INFO -   - Jensen-Shannon Divergence Mean (Categorical): 0.1340
2025-04-03 09:53:25,275 - INFO - 
Machine Learning Efficacy Metrics on Real Data:
2025-04-03 09:53:25,275 - INFO -   LogisticRegression:
2025-04-03 09:53:25,277 - INFO -     - Accuracy: 0.8528
2025-04-03 09:53:25,279 - INFO -     - F1: 0.8528
2025-04-03 09:53:25,281 - INFO -   RandomFor

In [11]:
# 8. Extract and display TSTR results specifically
print("\n# 8. TSTR Performance Results")
tstr_results = get_tstr_results(evaluation_results)
if tstr_results is not None:
    print(tstr_results)


# 8. TSTR Performance Results
                    Accuracy        F1
LogisticRegression   0.32445  0.298618
RandomForest         0.23690  0.206445
MLP                  0.29400  0.271157
XGBoost              0.28305  0.262957


In [12]:
# 9. Save the synthetic data
print("\n# 9. Save synthetic data")
output_path = "letter_synthetic.csv"
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic data saved to {output_path}")

print("\nTest completed successfully!")


# 9. Save synthetic data
Synthetic data saved to letter_synthetic.csv

Test completed successfully!
